How To Get All The Contiguous Substrings Of A String In Python? - python

Here is my code, but I want a better solution. What do you think about the problem?
def get_all_substrings(string):
    """Return every contiguous, non-empty substring of ``string``.

    Substrings are ordered by start index, then by increasing length, so
    ``'abc'`` gives ``['a', 'ab', 'abc', 'b', 'bc', 'c']``. An empty string
    gives an empty list.
    """
    length = len(string)
    alist = []
    for i in range(length):
        for j in range(i, length):
            # The slice end is exclusive, hence j + 1 to include string[j].
            alist.append(string[i:j + 1])
    return alist


# The original call misspelled the name as get_all_substring (NameError).
print(get_all_substrings('abcde'))

The only improvement I could think of is to use a list comprehension, like this:
def get_all_substrings(input_string):
    """Return every contiguous, non-empty substring of ``input_string``.

    The result is ordered by start index, then by increasing length.
    """
    length = len(input_string)
    # range() replaces the Python-2-only xrange().
    return [
        input_string[i:j + 1]
        for i in range(length)
        for j in range(i, length)
    ]


print(get_all_substrings('abcde'))
The timing comparison between yours and mine:
from timeit import timeit


def get_all_substrings(string):
    """Return all contiguous substrings of ``string`` using nested loops."""
    length = len(string)
    alist = []
    for i in range(length):
        for j in range(i, length):
            alist.append(string[i:j + 1])
    return alist


def get_all_substrings_1(input_string):
    """Return all contiguous substrings of ``input_string`` via a list comprehension."""
    length = len(input_string)
    return [input_string[i:j + 1] for i in range(length) for j in range(i, length)]


# Guarded so that importing this module does not start the benchmark. The
# setup strings import from __main__, so the file must be run as a script.
if __name__ == "__main__":
    print(timeit("get_all_substrings('abcde')", "from __main__ import get_all_substrings"))
    # 3.33308315277
    print(timeit("get_all_substrings_1('abcde')", "from __main__ import get_all_substrings_1"))
    # 2.67816185951

This can be done concisely with itertools.combinations:
from itertools import combinations


def get_all_substrings_2(string):
    """Return all contiguous, non-empty substrings of ``string``.

    Every pair of distinct cut points ``start < stop`` taken from
    ``0..len(string)`` delimits exactly one substring.
    """
    cut_points = range(len(string) + 1)
    substrings = []
    for start, stop in combinations(cut_points, 2):
        substrings.append(string[start:stop])
    return substrings

You could write it as a generator to save storing all the strings in memory at once if you don't need to
def get_all_substrings(string):
    """Yield every contiguous, non-empty substring of ``string`` lazily.

    Substrings are produced by start index, then by increasing length, and
    are never all held in memory at once.
    """
    length = len(string)
    for i in range(length):
        # j is the exclusive slice end, so it runs one past the last index.
        for j in range(i + 1, length + 1):
            yield string[i:j]


for i in get_all_substrings("abcde"):
    print(i)
you can still make a list if you really need one
# Materialise the generator when an actual list is required.
alist = list(get_all_substrings("abcde"))
The function can be reduced to return a generator expression
def get_all_substrings(s):
    """Return a generator over every contiguous, non-empty substring of ``s``."""
    length = len(s)
    # range() replaces the Python-2-only xrange().
    return (s[i:j] for i in range(length) for j in range(i + 1, length + 1))
Or of course you can change two characters to return a list if you don't care about memory
def get_all_substrings(s):
    """Return a list of every contiguous, non-empty substring of ``s``."""
    length = len(s)
    # range() replaces the Python-2-only xrange().
    return [s[i:j] for i in range(length) for j in range(i + 1, length + 1)]

I've never been fond of range(len(seq)), how about using enumerate and just using the index value:
def indexes(seq, start=0):
    """Return a generator of consecutive integers, one per item of ``seq``, counting from ``start``."""
    return (numbered[0] for numbered in enumerate(seq, start=start))


def gen_all_substrings(s):
    """Return a generator over every contiguous, non-empty substring of ``s``."""
    return (
        s[begin:end]
        for begin in indexes(s)
        # The tail s[begin:] has one item per valid end index begin+1 .. len(s).
        for end in indexes(s[begin:], begin + 1)
    )


def get_all_substrings(string):
    """Return the substrings produced by ``gen_all_substrings`` as a list."""
    return [piece for piece in gen_all_substrings(string)]


print(get_all_substrings('abcde'))

Python 3
s = 'abc'
# All contiguous substrings of s; the value is only echoed in an interactive session.
[s[start:stop + 1] for start in range(len(s)) for stop in range(start, len(s))]
['a', 'ab', 'abc', 'b', 'bc', 'c']

Use itertools.permutations to generate all pairs of possible start and end indexes,
and filter out only those where the start index is less than the end index. Then
use these pairs to return slices of the original string.
from itertools import permutations


def gen_all_substrings(s):
    """Return a generator over every contiguous, non-empty substring of ``s``.

    All ordered pairs of cut points are generated; only those whose start
    comes before the end are kept and used as slice bounds.
    """
    cut_points = range(len(s) + 1)
    return (
        s[start:stop]
        for start, stop in permutations(cut_points, 2)
        if start < stop
    )


def get_all_substrings(s):
    """Return the substrings produced by ``gen_all_substrings`` as a list."""
    return [piece for piece in gen_all_substrings(s)]


print(get_all_substrings('abcde'))

Another solution:
def get_all_substrings(string):
    """Return every contiguous, non-empty substring of ``string``.

    The result is ordered by start index, then by increasing length.
    """
    length = len(string) + 1
    # Starting y at x + 1 yields only non-empty slices, so nothing has to be
    # sliced twice and filtered out afterwards.
    return [string[x:y] for x in range(length) for y in range(x + 1, length)]


# The original call misspelled the name as get_all_substring (NameError).
print(get_all_substrings('abcde'))

Another solution using 2-D matrix approach
p = "abc"
a = list(p)
b = list(p)
# c starts with the single characters and collects the longer substrings.
c = list(p)
count = 0
for i, dump in enumerate(a):
    # Extend the substring starting at position i one character at a time.
    for following in b[i + 1:]:
        dump = dump + following
        c.append(dump)

If you want to get the substrings sorted by the length:
from typing import List

s = 'abcde'


def allSubstrings(s: str) -> List[str]:
    """Return all contiguous substrings of ``s``, shortest first.

    Substrings of equal length appear in order of their start index.
    """
    length = len(s)
    mylist = []
    # i is the substring length, j the start index of a window of that length.
    for i in range(1, length + 1):
        for j in range(length - i + 1):
            mylist.append(s[j:j + i])
    return mylist


print(allSubstrings(s))
['a', 'b', 'c', 'd', 'e', 'ab', 'bc', 'cd', 'de', 'abc', 'bcd', 'cde', 'abcd', 'bcde', 'abcde']

Related

How to get multiple most frequent k-mers of a string using Python?

If I insert the following
Insert the Text:
ACACACA
Insert a value for k:
2
For the following codes
# NOTE(review): indentation was lost when this snippet was pasted; the nesting
# of the loops and of the trailing print/if/return must be restored to run it.
# Prompt for the text and for the k-mer length k.
print("Insert the Text:")
Text = input()
print("Insert a value for k:")
k = int(input())
Pattern = " "
# Module-level accumulators that FrequentWords appends to.
count = [ ]
FrequentPatterns = [ ]
def FrequentWords(Text, k):
# Slide a window of width k over Text; each window is a candidate Pattern.
for i in range (len(Text)-k+1):
Pattern = Text[i: i+k]
c = 0
# Count the occurrences of Pattern in Text (reuses the loop variable i).
for i in range (len(Text)-len(Pattern)+1):
if Text[i: i+len(Pattern)] == Pattern:
c = c+1
else:
continue
count.extend([c])
print(count)
# NOTE(review): only count[i] for a single leftover i is compared with the
# maximum, and only the current Pattern is recorded -- presumably why just one
# k-mer is reported; confirm once the indentation is restored.
if count[i] == max(count):
FrequentPatterns.extend([Pattern])
return FrequentPatterns
FrequentWords(Text, k)
I get the following output
Insert the Text:
ACACACA
Insert a value for k:
2
[3, 3, 3, 3, 3, 3]
['CA']
Clearly there are two FrequentPatterns. So the last list output should be ['AC', 'CA']
I don't know why this code isn't working. Really appreciate if anyone could help.
Here's how I would solve this:
from itertools import groupby


def find_kgrams(string, k):
    """Count the k-character substrings (k-grams) of ``string``.

    Returns a list of ``(kgram, count)`` tuples ordered by descending count;
    k-grams with equal counts stay in ascending lexical order.
    """
    chunks = []
    for offset in range(k):
        # Walk the string in steps of k starting at `offset`; the stop bound
        # leaves out a trailing chunk shorter than k.
        stop = (len(string) - offset) // k * k
        for start in range(offset, stop, k):
            chunks.append(string[start:start + k])
    # groupby only merges adjacent equal items, so sort first.
    chunks.sort()
    tallies = [(kgram, sum(1 for _ in run)) for kgram, run in groupby(chunks)]
    # The sort is stable, so ties keep the lexical order established above.
    tallies.sort(key=lambda pair: pair[1], reverse=True)
    return tallies
The way this works is:
it produces string chunks of the given length k, e.g.:
starting from 0: 'ACACACA' -> 'AC', 'AC', 'AC'
starting from 1: 'ACACACA' -> 'CA', 'CA', 'CA'
...up to k - 1 (1 is the maximum for k == 2)
groupby() groups those chunks
sorted() sorts them by count
a list of tuples of kgrams and their count is returned
Test:
# Count the 2-grams of the example text from the question.
s = 'ACACACA'
kgrams = find_kgrams(s, 2)
print(kgrams)
prints:
[('AC', 3), ('CA', 3)]
It's already sorted, you can pick the most frequent one(s) from the front of the returned list:
# kgrams is sorted by count, descending, so the front entry holds the highest
# count; keep every k-gram that ties it. (The original closed the list with
# ")" and compared against the second entry, kgrams[1].)
max_kgrams = [kgram for kgram, count in kgrams if count == kgrams[0][1]]
print(max_kgrams)
prints:
['AC', 'CA']

Python: Combining numbers in a list

I have a list, say
x = [0,1,2,3,"a","b","cd"]
I want to keep the smallest number and all of the letters, hence in the example it would become
x = [0,"a","b","cd"]
How can I do this? Ideally the code would be very efficient, as I'm doing this for millions of lists.
Attempts: I've tried finding min(x), however it results in an error as there are strings in the list
I think the below code is most efficient. It does not take extra memory and complexity is O(N)
import sys

x = [0, 1, 2, 3, "a", "b", "cd"]
minimum = sys.maxsize # for python 3.x
# minimum = sys.maxint #for python 2
# j is the write position: strings are compacted to the front of x in place,
# while every non-string value only updates the running minimum.
j = 0
for i, item in enumerate(x):
    if not isinstance(item, str):
        minimum = min(minimum, item)
        continue
    x[j] = item
    j += 1
print([minimum] + x[:j])
Output
[0, 'a', 'b', 'cd']
try this:
Python 2.7
# Labelled Python 2.7 in the post: relies on min(x) accepting a list that
# mixes ints and strs.
# The minimum is appended last, after the strings (see the output below).
output = [s for s in x if isinstance(s, str)]
output.append(min(x))
#>>> output
#['a', 'b', 'cd', 0]
Python3:
# Strings first, in their original order ...
output = list(filter(lambda value: isinstance(value, str), x))
# ... then the smallest of the ints, appended at the end.
output.append(min(value for value in x if isinstance(value, int)))
You can use itertools.groupby
>>> # Needs `from itertools import groupby` first (not shown in the post).
>>> # For the int run, min(n, *g) consumes the rest of the group, so only one value is emitted.
>>> # NOTE(review): a run holding a single int would call min(n) on a bare int -- verify.
>>> x = [0,1,2,3,"a","b","cd"]
>>> [min(n, *g) if t == int else n for t, g in groupby(x, type) for n in g]
[0, 'a', 'b', 'cd']
More efficient would be to just min the integers and unpack the strings.
>>> # Needs `from itertools import groupby` first (not shown in the post).
>>> # grouped[0] / grouped[1] assume the ints form the first run and the strings the second.
>>> x = [0,1,2,3,"a","b","cd"]
>>> grouped = [list(g) for t, g in groupby(x, type)]
>>> [min(grouped[0]), *grouped[1]]
[0, 'a', 'b', 'cd']
One option would be to use the .isnumeric() method to find the minimum number while building a new list for the strings. This should be O(n). Not super fast, but not slow either.
You could say something like:
# Single pass over x: track the smallest number and collect the strings.
min_number = None
string_list = []
for i in x:
    if isinstance(i, str):
        string_list.append(i)
    # ints have no .isnumeric() method, so test the type instead; bool is
    # excluded because it is a subclass of int.
    elif isinstance(i, (int, float)) and not isinstance(i, bool):
        if min_number is None or i < min_number:
            min_number = i
if min_number is not None:
    # list.insert() returns None, so build the new list explicitly.
    x = [min_number] + string_list
I don't think this would be the most efficient but you could separate the list into two lists - one with ints and one with strings - and then find the min then rejoin them. This would look something like:
x = [0, 1, 2, 'a', 'b', 'c']
# Partition x by type: ints go to nums, everything else to strings.
nums = [item for item in x if isinstance(item, int)]
strings = [item for item in x if not isinstance(item, int)]
Now, after you run this, you can get the min and then rejoin the lists
# The non-int items were collected in `strings`; `chars` was never defined.
result = [min(nums)] + strings
This will give [0, 'a', 'b', 'c']
Something like this should work
def minNum(array):
    """Remove every number except the smallest from ``array``, in place.

    Only values whose type is exactly ``int`` or ``float`` count as numbers
    (so ``bool`` is left alone); all other items are kept in their original
    order. Numbers equal to the minimum are all kept. Returns the same list
    object that was passed in.
    """
    numeric_types = (int, float)
    smallest = None
    for item in array:
        if type(item) in numeric_types and (smallest is None or item < smallest):
            smallest = item
    # Filter by value rather than by array.index(): index() returns the first
    # equal element, which made the original delete the wrong items whenever
    # a number appeared more than once.
    array[:] = [
        item for item in array
        if type(item) not in numeric_types or item == smallest
    ]
    return array
Definitely not the only solution, but it's fairly compact and works well for all the test cases I gave it.

next steps to make this recursive and more pythonic as well

My ultimate goal is to sum a list of values by summing each pair in the list, producing a smaller list, summing each of those pairs, and so on. For now, I have a list with an even number of elements, but eventually I want to be able to handle an arbitrary sized list.
With some help from previous posts, I have this function:
# NOTE(review): indentation was lost when this snippet was pasted, so the
# nesting of the while/if/for bodies below is ambiguous and must be restored
# before this can run.
# Intended (per the question) to sum lst by repeatedly adding adjacent pairs.
def collapse_sum(lst):
sums = list()
# rems collects pairs set aside when the number of pairs is odd.
rems = list()
acc = 0
n = 2
print(lst)
print("final sum: {}\n".format(sum(lst)))
# Chunk lst into slices of n (= 2) elements.
pairs = [lst[i:i + n] for i in range(0, len(lst), n)]
while len(pairs) > 0:
if len(pairs) % 2 == 1:
rems.append(pairs[-1])
del pairs[-1]
# `for a,b in pairs` unpacks two values, so every slice must hold exactly two items.
for a,b in pairs:
sums.append(a + b)
pairs = [sums[i:i + 2] for i in range(0,len(sums),n)]
sums = list()
if len(pairs) == 1:
for a,b in pairs:
acc += a + b
# Flatten the set-aside pairs and chunk them again.
rems = [item for sublist in rems for item in sublist]
pairs = [rems[i:i + n] for i in range(0, len(rems), n)]
if len(pairs) == 1:
for a,b in pairs:
acc += a + b
del pairs[-1]
return acc
rems = list()
print(acc)
return acc
My ultimate goal is to sum a list of values by summing each pair in
the list, producing a smaller list, summing each of those pairs, and
so on.
Here's a solution that does that for even or odd-sized lists, without using any library dependencies.
# Helper function to split a sequence obj into blocks/chunks
def blocks(obj, n):
    """Yield consecutive slices of ``obj`` holding ``n`` items each.

    ``obj`` must support ``len()`` and slicing. The final slice is shorter
    when ``len(obj)`` is not a multiple of ``n``.
    """
    for i in range(0, len(obj), n):
        yield obj[i:i + n]


def collapse_sum(obj):
    """Sum ``obj`` by repeatedly adding adjacent pairs until one value is left.

    An unpaired trailing element is carried into the next round unchanged.
    An empty sequence gives ``0``, matching the built-in ``sum()``.
    """
    if len(obj) == 0:
        # Without this guard the final obj[0] raises IndexError.
        return 0
    while len(obj) > 1:
        obj = [sum(pair) for pair in blocks(obj, 2)]
    return obj[0]
Some examples:
# Usage examples; the bare numbers appear to be the value returned by the
# call on the line before them.
a = [1, 2, 3, 4, 5]
b = [1, 2, 3, 4]
collapse_sum(a)
15
collapse_sum(b)
10
You can visualize this here.

Find the position of the longest repeated letter

I have a file that contains letters. I need to find the position of the longest repeated letters. For example, if the file contains aaassdddffccsdddfgssfrsfspppppppppppddsfs, I need a program that finds the position of ppppppppppp. I know that I need to use a .index function to find the location however I am stuck on the loop.
Using itertools.groupby:
import itertools

mystr = 'aaassdddffccsdddfgssfrsfspppppppppppddsfs'
# idx is the start position of the run currently being examined.
idx = 0
maxidx, maxlen = 0, 0
for _, group in itertools.groupby(mystr):
    grouplen = len(list(group))
    # Strict comparison: the earliest run wins a tie.
    if grouplen > maxlen:
        maxidx = idx
        maxlen = grouplen
    idx += grouplen
Gives the idx and the length of the longest identical substring:
>>> print(maxidx, maxlen)
25 11
>>> mystr[25:25+11]
'ppppppppppp'
You're going to need to loop through the entire string. Keep track of each new letter you come across, as well as its index and how long each sequence is. Only store the max sequence.
s = 'aaassdddffccsdddfgssfrsfspppppppppppddsfs'
# Longest run seen so far: its character, start index and length.
max_c = max_i = max_len = None
# Run currently being scanned.
cur_c = cur_i = cur_len = None
for i, c in enumerate(s):
    if c == cur_c:
        cur_len += 1
        continue
    # c starts a new run: settle the run that just ended, then reset.
    if cur_len is not None and (max_len is None or cur_len > max_len):
        max_c, max_i, max_len = cur_c, cur_i, cur_len
    cur_c, cur_i, cur_len = c, i, 1
# One last check: the final run is never followed by a different character.
if cur_len is not None and (max_len is None or cur_len > max_len):
    max_c, max_i, max_len = cur_c, cur_i, cur_len
# print() call instead of the Python-2-only print statement.
print(max_c, max_i, max_len)
Here is a one-liner:
from itertools import groupby
from functools import reduce
from operator import itemgetter  # was missing: itemgetter raised NameError

# One (key, position, length) tuple per run of identical characters.
# next(g) takes the first (index, char) pair of the run for its position;
# the remaining pairs are then counted, hence the + 1.
runs = [(k, next(g)[0], sum(1 for _ in g)+1) for k, g in groupby(enumerate(
    'aaassdddffccsdddfgssfrsfspppppppppppddsfs'), key=itemgetter(1))]
# Bound to a name and printed so the result is visible outside a REPL too.
print(runs)
The above generates (key, position, length). You can get the maximum length by
applying reduce
from itertools import groupby
from functools import reduce
from operator import itemgetter

# Fold the (key, position, length) tuples down to the one with the greatest
# length; >= keeps the earlier run when two lengths tie.
longest = reduce(lambda x,y:x if x[2] >= y[2] else y,
    ((k, next(g)[0], sum(1 for _ in g)+1) for k, g in groupby(enumerate(
    'aaassdddffccsdddfgssfrsfspppppppppppddsfs'), key=itemgetter(1))))
# Bound to a name and printed so the result is visible outside a REPL too.
print(longest)
A quick way of achieving this is to use a regex to match repeating characters with (.)(\1+). Then we loop over all those results using a generator comprehension and find the max according to the length (key=len). Finally having found the largest string, we call thestr.index() to find where the longest repeated letter occurred:
import re

txt = "aaassdddffccsdddfgssfrsfspppppppppppddsfs"
# findall returns one (first char, repeats) tuple per run of two or more
# identical characters; joining the pair rebuilds the full run.
repeated_runs = (''.join(parts) for parts in re.findall(r"(.)(\1+)", txt))
longest_run = max(repeated_runs, key=len)
# Position of the first occurrence of that run in the text.
idx = txt.index(longest_run)
print(idx)
Here is the same code broken out into stages:
>>> # Same steps one at a time: collect the repeated runs, pick the longest, locate it.
>>> import re
>>> txt = "aaassdddffccsdddfgssfrsfspppppppppppddsfs"
>>> matches = list(''.join(f) for f in re.findall(r"(.)(\1+)", txt))
>>> print(matches)
['aaa', 'ss', 'ddd', 'ff', 'cc', 'ddd', 'ss', 'ppppppppppp', 'dd']
>>> longest = max(matches, key=len)
>>> print(longest)
ppppppppppp
>>> print(txt.index(longest))
25

How to remove a string from a list of strings if its length is lower than the length of the string with max length in Python 2.7?

How to remove a string from a list of strings if its length is lower than the length of the string with max length in Python 2.7?
Basically, if I have a list such as:
test = ['cat', 'dog', 'house', 'a', 'range', 'abc']
max_only(test)
The output should be:
['house', 'range']
'cat' has length 3, 'dog' 3, 'house' 5, 'a' 1, 'range' 5 and 'abc' 3. The strings with the greatest length are 'house' and 'range', so they're returned.
I tried with something like this but, of course, it doesn't work :)
# NOTE(review): indentation was lost when this snippet was pasted. This is the
# asker's non-working attempt, kept as posted.
def max_only(lst):
# Collect the length of every string.
ans_lst = []
for i in lst:
ans_lst.append(len(i))
for k in range(len(lst)):
# NOTE(review): i appears to be left over from the loop above rather than
# tied to k, and remove() targets the element at the position of the maximum
# length while ans_lst is never updated -- verify once nesting is restored.
if len(i) < max(ans_lst):
lst.remove(lst[ans_lst.index(max(ans_lst))])
return lst
Could you help me?
Thank you.
EDIT: What about the same thing for the min length element?
Use a list comprehension and max:
>>> test = ['cat', 'dog', 'house', 'a', 'range', 'abc']
>>> max_ = max(len(x) for x in test) # Length of the longest string.
>>> [x for x in test if len(x) == max_] # Keep only the strings whose length equals max_.
['house', 'range']
A solution that loops just once:
def max_only(lst):
    """Return the longest strings of ``lst`` in their original order.

    Makes a single pass over ``lst``, so any iterable works. An empty input
    gives an empty list.
    """
    longest = []
    best = -1
    for word in lst:
        size = len(word)
        if size > best:
            # New record length: drop everything collected so far.
            longest[:] = [word]
            best = size
        elif size == best:
            longest.append(word)
    return longest
max(iterable) has to loop through the whole list once, and a list comprehension picking out items of matching length has to loop through the list again. The above version loops through the input list just once.
If your input list is not a sequence but an iterator, this algorithm will still work while anything that has to use max() won't; it'd have exhausted the iterator just to find the maximum length.
Timing comparison on 100 random words of length 0 to 9, repeated 1 million times:
>>> # Benchmark: the one-pass max_only against max() plus a list comprehension.
>>> import timeit
>>> import random
>>> import string
>>> # NOTE(review): range(1, random.randrange(11)) yields 0 to 9 characters, so a word may be empty.
>>> words = [''.join([random.choice(string.ascii_lowercase) for _ in range(1, random.randrange(11))]) for _ in range(100)]
>>> def max_only(lst):
... result, maxlen = [], -1
... for item in lst:
... itemlen = len(item)
... if itemlen == maxlen:
... result.append(item)
... elif itemlen > maxlen:
... result[:], maxlen = [item], itemlen
... return result
...
>>> timeit.timeit('f(words)', 'from __main__ import max_only as f, words')
23.173006057739258
>>> def max_listcomp(lst):
... max_ = max(len(x) for x in lst)
... return [x for x in lst if len(x) == max_]
>>> timeit.timeit('f(words)', 'from __main__ import max_listcomp as f, words')
36.34060215950012
Replacing result.append() with a cached r_append = result.append outside the for loop shaves off another 2 seconds:
>>> # result[:] = ... changes the same list object in place, so the cached
>>> # r_append bound method keeps appending to the live result.
>>> def max_only(lst):
... result, maxlen = [], -1
... r_append = result.append
... for item in lst:
... itemlen = len(item)
... if itemlen == maxlen:
... r_append(item)
... elif itemlen > maxlen:
... result[:], maxlen = [item], itemlen
... return result
...
>>> timeit.timeit('f(words)', 'from __main__ import max_only as f, words')
21.21125817298889
And by popular request, a min_only() version:
def min_only(lst):
    """Return the shortest strings of ``lst`` in their original order.

    Makes a single pass over ``lst``, so any iterable works. An empty input
    gives an empty list.
    """
    shortest = []
    best = float('inf')
    for word in lst:
        size = len(word)
        if size < best:
            # New record length: drop everything collected so far.
            shortest[:] = [word]
            best = size
        elif size == best:
            shortest.append(word)
    return shortest
More fun still, a completely different tack: sorting on length:
from itertools import groupby


def max_only(lst):
    """Return the longest strings of ``lst`` in input order (``[]`` if empty).

    Sorts by length, longest first, and takes the first group of equal length.
    """
    # sorted() is stable even with reverse=True, so equal-length words are
    # already in input order; reversing the group again would flip them.
    by_length = groupby(sorted(lst, key=len, reverse=True), key=len)
    # The default covers an empty input, where there is no first group.
    _, longest = next(by_length, (None, ()))
    return list(longest)


def min_only(lst):
    """Return the shortest strings of ``lst`` in input order (``[]`` if empty).

    Sorts by length, shortest first, and takes the first group of equal length.
    """
    by_length = groupby(sorted(lst, key=len), key=len)
    _, shortest = next(by_length, (None, ()))
    return list(shortest)
These work by sorting by length, then picking out the first group of words with equal length. For max_only() we need to sort in reverse, then re-reverse the result. Sorting has a O(NlogN) cost, making this less efficient than the O(2N) solutions in other answers here or my O(N) solution above:
>>> # NOTE(review): imports max_only_sorted, but the snippet above names the function max_only -- presumably renamed for this benchmark.
>>> timeit.timeit('f(words)', 'from __main__ import max_only_sorted as f, words')
52.725801944732666
Still, the sorting approach gives you a fun one-liner.
You can use max() which returns the largest item in the list.
>>> # max(..., key=len) returns one of the longest strings; only its length is kept.
>>> len_max = len(max(test, key=len))
>>> [x for x in test if len(x) == len_max]
['house', 'range']
If you then take all the strings that have the same length as the element you get the desired result.
>>> test = ['cat', 'dog', 'house', 'a', 'range', 'abc']
>>> # m is bound once as a lambda default, so the maximum length is not recomputed per element.
>>> filter(lambda x,m=max(map(len, test)):len(x)==m, test)
['house', 'range']
For Python3.x you would need to use list(filter(...))
This works:
# Greatest string length in test, then every string of exactly that length.
max_len = max(map(len, test))
result = list(filter(lambda word: len(word) == max_len, test))

Categories