How to find the longest common substring between two strings using Python? - python

I want to write a Python code that computes the longest common substring between two strings from the input.
Example:
word1 = input('Give 1. word: xlaqseabcitt')
word2 = input('Give 2. word: peoritabcpeor')
Wanted output:
abc
I have code like this so far:
word1 = input("Give 1. word: ")
word2 = input("Give 2. word: ")
longestSegment = ""
tempSegment = ""
for i in range(len(word1)):
if word1[i] == word2[i]:
tempSegment += word1[i]
else:
tempSegment = ""
if len(tempSegment) > len(longestSegment):
longestSegment = tempSegment
print(longestSegment)
I end up with IndexError when word2 is shorter than word1, and it does not give me the common substring.
EDIT: I found this solution:
string1 = input('Give 1. word: ')
string2 = input('Give 2. word: ')
answer = ""
len1, len2 = len(string1), len(string2)
for i in range(len1):
for j in range(len2):
lcs_temp=0
match=''
while ((i+lcs_temp < len1) and (j+lcs_temp<len2) and string1[i+lcs_temp] == string2[j+lcs_temp]):
match += string2[j+lcs_temp]
lcs_temp+=1
if (len(match) > len(answer)):
answer = match
print(answer)
However, I would like to see a library function call that could be used to compute the longest common substring between two strings.
Alternatively, please suggest a more concise code to achieve the same.

You can build a dictionary from the first string containing the positions of each character, keyed on the characters. Then go through the second string and compare the substring of each character with the rest of the second string at that position:
# extract common prefix
def common(A,B) :
firstDiff = (i for i,(a,b) in enumerate(zip(A,B)) if a!=b) # 1st difference
commonLen = next(firstDiff,min(len(A),len(B))) # common length
return A[:commonLen]
word1 = "xlaqseabcitt"
word2 = "peoritabcpeor"
# position(s) of each character in word1
sub1 = dict()
for i,c in enumerate(word1): sub1.setdefault(c,[]).append(i)
# maximum (by length) of common prefixes from matching first characters
maxSub = max((common(word2[i:],word1[j:])
for i,c in enumerate(word2)
for j in sub1.get(c,[])),key=len)
print(maxSub) # abc

For me, looks like the solution that works is using the suffix_trees package:
from suffix_trees import STree
a = ["xxx ABC xxx", "adsa abc"]
st = STree.STree(a)
print(st.lcs()) # "abc"

Here is an answer if you later want to compute any number of strings. It should return the longest common substring. It work with the different test i gave it. (as long as you don't use the '§' character)
It is not a library but you can still import the functions in your code just like a library. You can use the same logic with your own code (only for two strings.) Do so as follows (put both files in the same directory for the sake of simplicity). I am supposing you will call the file findmatch.py.
import findmatch
longest_substring = findmatch.prep(['list', 'of', 'strings'])
Here is the code that should be in 'findmatch.py'.
def main(words,first):
nextreference = first
reference = first
for word in words:
foundsub = False
print('reference : ',reference)
print('word : ', word)
num_of_substring = 0
length_longest_substring = 0
for i in range(len(word)):
print('nextreference : ', nextreference)
letter = word[i]
print('letter : ', letter)
if word[i] in reference:
foundsub = True
num_of_substring += 1
locals()['substring'+str(num_of_substring)] = word[i]
print('substring : ', locals()['substring'+str(num_of_substring)])
for j in range(len(reference)-i):
if word[i:i+j+1] in reference:
locals()['substring'+str(num_of_substring) ]= word[i:i+j+1]
print('long_sub : ',locals()['substring'+str(num_of_substring)])
print('new : ',len(locals()['substring'+str(num_of_substring)]))
print('next : ',len(nextreference))
print('ref : ', len(reference))
longer = (len(reference)<len(locals()['substring'+str(num_of_substring)]))
longer2 = (len(nextreference)<len(locals()['substring'+str(num_of_substring)]))
if (num_of_substring==1) or longer or longer2:
nextreference = locals()['substring'+str(num_of_substring)]
if not foundsub:
for i in range(len(words)):
words[i] = words[i].replace(reference, '§')
#§ should not be used in any of the strings, put a character you don't use here
print(words)
try:
nextreference = main(words, first)
except Exception as e:
return None
reference = nextreference
return reference
def prep(words):
first = words[0]
words.remove(first)
answer = main(words, first)
return answer
if __name__ == '__main__':
words = ['azerty','azertydqse','fghertqdfqf','ert','sazjjjjjjjjjjjert']
#just some absurd examples any word in here
substring = prep(words)
print('answer : ',substring)
It is basically creating your own library.
I hope this aswers helps someone.

Here is a recursive solution :
def lcs(X, Y, m, n):
if m == 0 or n == 0:
return 0
elif X[m - 1] == Y[n - 1]:
return 1 + lcs(X, Y, m - 1, n - 1);
else:
return max(lcs(X, Y, m, n - 1), lcs(X, Y, m - 1, n));

Since someone asked for a multiple-word solution, here's one:
def multi_lcs(words):
words.sort(key=lambda x:len(x))
search = words.pop(0)
s_len = len(search)
for ln in range(s_len, 0, -1):
for start in range(0, s_len-ln+1):
cand = search[start:start+ln]
for word in words:
if cand not in word:
break
else:
return cand
return False
>>> multi_lcs(['xlaqseabcitt', 'peoritabcpeor'])
'abc'
>>> multi_lcs(['xlaqseabcitt', 'peoritabcpeor', 'visontatlasrab'])
'ab'

for small strings, copy this into a file in your project, let's say string_utils.py
def find_longest_common_substring(string1, string2):
s1 = string1
s2 = string2
longest_substring = ""
longest_substring_i1 = None
longest_substring_i2 = None
# iterate through every index (i1) of s1
for i1, c1 in enumerate(s1):
# for each index (i2) of s2 that matches s1[i1]
for i2, c2 in enumerate(s2):
# if start of substring
if c1 == c2:
delta = 1
# make sure we aren't running past the end of either string
while i1 + delta < len(s1) and i2 + delta < len(s2):
# if end of substring
if s2[i2 + delta] != s1[i1 + delta]:
break
# still matching characters move to the next character in both strings
delta += 1
substring = s1[i1:(i1 + delta)]
# print(f'substring candidate: {substring}')
# replace longest_substring if newly found substring is longer
if len(substring) > len(longest_substring):
longest_substring = substring
longest_substring_i1 = i1
longest_substring_i2 = i2
return (longest_substring, longest_substring_i1, longest_substring_i2)
Then it can be used as follows:
import string_utils
print(f"""(longest substring, index of string1, index of string2):
{ string_utils.find_longest_common_substring("stackoverflow.com", "tackerflow")}""")
For any that are curious the print statement when uncommented prints:
substring candidate: tack
substring candidate: ack
substring candidate: ck
substring candidate: o
substring candidate: erflow
substring candidate: rflow
substring candidate: flow
substring candidate: low
substring candidate: ow
substring candidate: w
substring candidate: c
substring candidate: o
(longest substring, index of string1, index of string2):
('erflow', 7, 4)

Here is a naive solution in terms of time complexity but simple enough to understand:
def longest_common_substring(a, b):
"""Find longest common substring between two strings A and B."""
if len(a) > len(b):
a, b = b, a
for i in range(len(a), 0, -1):
for j in range(len(a) - i + 1):
if a[j:j + i] in b:
return a[j:j + i]
return ''

A super fast library is available for Python: pylcs
It can find the indices of the longest common substring (LCS) between 2 strings, and can do some other related tasks as well.
A function to return the LCS using this library consists of 2 lines:
import pylcs
def find_LCS(s1, s2):
res = pylcs.lcs_string_idx(s1, s2)
return ''.join([s2[i] for i in res if i != -1])
Example:
s1 = 'bbbaaabaa'
s2 = 'abaabaab'
print(find_LCS(s1, s2))
aabaa
Explanation:
In this example res is:
[-1, -1, -1, -1, 2, 3, 4, 5, 6]
It is a mapping of all characters in s1 - to the indices of characters in s2 of the LCS.
-1 indicates that the character of s1 is NOT part of the LCS.
The reasons behind the speed and efficiency of this library are that it's implemented in C++ and uses dynamic programming.

Related

How to find the most amount of shared characters in two strings? (Python)

yamxxopd
yndfyamxx
Output: 5
I am not quite sure how to find the number of the most amount of shared characters between two strings. For example (the strings above) the most amount of characters shared together is "yamxx" which is 5 characters long.
xx would not be a solution because that is not the most amount of shared characters. In this case the most is yamxx which is 5 characters long so the output would be 5.
I am quite new to python and stack overflow so any help would be much appreciated!
Note: They should be the same order in both strings
Here is simple, efficient solution using dynamic programming.
def longest_subtring(X, Y):
m,n = len(X), len(Y)
LCSuff = [[0 for k in range(n+1)] for l in range(m+1)]
result = 0
for i in range(m + 1):
for j in range(n + 1):
if (i == 0 or j == 0):
LCSuff[i][j] = 0
elif (X[i-1] == Y[j-1]):
LCSuff[i][j] = LCSuff[i-1][j-1] + 1
result = max(result, LCSuff[i][j])
else:
LCSuff[i][j] = 0
print (result )
longest_subtring("abcd", "arcd") # prints 2
longest_subtring("yammxdj", "nhjdyammx") # prints 5
This solution starts with sub-strings of longest possible lengths. If, for a certain length, there are no matching sub-strings of that length, it moves on to the next lower length. This way, it can stop at the first successful match.
s_1 = "yamxxopd"
s_2 = "yndfyamxx"
l_1, l_2 = len(s_1), len(s_2)
found = False
sub_length = l_1 # Let's start with the longest possible sub-string
while (not found) and sub_length: # Loop, over decreasing lengths of sub-string
for start in range(l_1 - sub_length + 1): # Loop, over all start-positions of sub-string
sub_str = s_1[start:(start+sub_length)] # Get the sub-string at that start-position
if sub_str in s_2: # If found a match for the sub-string, in s_2
found = True # Stop trying with smaller lengths of sub-string
break # Stop trying with this length of sub-string
else: # If no matches found for this length of sub-string
sub_length -= 1 # Let's try a smaller length for the sub-strings
print (f"Answer is {sub_length}" if found else "No common sub-string")
Output:
Answer is 5
s1 = "yamxxopd"
s2 = "yndfyamxx"
# initializing counter
counter = 0
# creating and initializing a string without repetition
s = ""
for x in s1:
if x not in s:
s = s + x
for x in s:
if x in s2:
counter = counter + 1
# display the number of the most amount of shared characters in two strings s1 and s2
print(counter) # display 5

Find most common substring in a list of strings?

I have a Python list of string names where I would like to remove a common substring from all of the names.
And after reading this similar answer I could almost achieve the desired result using SequenceMatcher.
But only when all items have a common substring:
From List:
string 1 = myKey_apples
string 2 = myKey_appleses
string 3 = myKey_oranges
common substring = "myKey_"
To List:
string 1 = apples
string 2 = appleses
string 3 = oranges
However I have a slightly noisy list that contains a few scattered items that don't fit the same naming convention.
I would like to remove the "most common" substring from the majority:
From List:
string 1 = myKey_apples
string 2 = myKey_appleses
string 3 = myKey_oranges
string 4 = foo
string 5 = myKey_Banannas
common substring = ""
To List:
string 1 = apples
string 2 = appleses
string 3 = oranges
string 4 = foo
string 5 = Banannas
I need a way to match the "myKey_" substring so I can remove it from all names.
But when I use the SequenceMatcher the item "foo" causes the "longest match" to be equal to blank "".
I think the only way to solve this is to find the "most common substring". But how could that be accomplished?
Basic example code:
from difflib import SequenceMatcher
names = ["myKey_apples",
"myKey_appleses",
"myKey_oranges",
#"foo",
"myKey_Banannas"]
string2 = names[0]
for i in range(1, len(names)):
string1 = string2
string2 = names[i]
match = SequenceMatcher(None, string1, string2).find_longest_match(0, len(string1), 0, len(string2))
print(string1[match.a: match.a + match.size]) # -> myKey_
Given names = ["myKey_apples", "myKey_appleses", "myKey_oranges", "foo", "myKey_Banannas"]
An O(n^2) solution I can think of is to find all possible substrings and storing them in a dictionary with the number of times they occur :
substring_counts={}
for i in range(0, len(names)):
for j in range(i+1,len(names)):
string1 = names[i]
string2 = names[j]
match = SequenceMatcher(None, string1, string2).find_longest_match(0, len(string1), 0, len(string2))
matching_substring=string1[match.a:match.a+match.size]
if(matching_substring not in substring_counts):
substring_counts[matching_substring]=1
else:
substring_counts[matching_substring]+=1
print(substring_counts) #{'myKey_': 5, 'myKey_apples': 1, 'o': 1, '': 3}
And then picking the maximum occurring substring
import operator
max_occurring_substring=max(substring_counts.iteritems(), key=operator.itemgetter(1))[0]
print(max_occurring_substring) #myKey_
Here's a overly verbose solution to your problem:
def find_matching_key(list_in, max_key_only = True):
"""
returns the longest matching key in the list * with the highest frequency
"""
keys = {}
curr_key = ''
# If n does not exceed max_n, don't bother adding
max_n = 0
for word in list(set(list_in)): #get unique values to speed up
for i in range(len(word)):
# Look up the whole word, then one less letter, sequentially
curr_key = word[0:len(word)-i]
# if not in, count occurance
if curr_key not in keys.keys() and curr_key!='':
n = 0
for word2 in list_in:
if curr_key in word2:
n+=1
# if large n, Add to dictionary
if n > max_n:
max_n = n
keys[curr_key] = n
# Finish the word
# Finish for loop
if max_key_only:
return max(keys, key=keys.get)
else:
return keys
# Create your "from list"
From_List = [
"myKey_apples",
"myKey_appleses",
"myKey_oranges",
"foo",
"myKey_Banannas"
]
# Use the function
key = find_matching_key(From_List, True)
# Iterate over your list, replacing values
new_From_List = [x.replace(key,'') for x in From_List]
print(new_From_List)
['apples', 'appleses', 'oranges', 'foo', 'Banannas']
Needless to say, this solution would look a lot neater with recursion. Thought I'd sketch out a rough dynamic programming solution for you though.
I would first find the starting letter with the most occurrences. Then I would take each word having that starting letter, and take while all these words have matching letters. Then in the end I would remove the prefix that was found from each starting word:
from collections import Counter
from itertools import takewhile
strings = ["myKey_apples", "myKey_appleses", "myKey_oranges", "berries"]
def remove_mc_prefix(words):
cnt = Counter()
for word in words:
cnt[word[0]] += 1
first_letter = list(cnt)[0]
filter_list = [word for word in words if word[0] == first_letter]
filter_list.sort(key = lambda s: len(s)) # To avoid iob
prefix = ""
length = len(filter_list[0])
for i in range(length):
test = filter_list[0][i]
if all([word[i] == test for word in filter_list]):
prefix += test
else: break
return [word[len(prefix):] if word.startswith(prefix) else word for word in words]
print(remove_mc_prefix(strings))
Out: ['apples', 'appleses', 'oranges', 'berries']
To find the most-common-substring from list of python-string
I already tested on python-3.10.5 I hope it will work for you.
I have the same use case but a different kind of task, I just need to find one common-pattern-string from a list of more than 100s files. To use as a regular-expression.
Your Basic example code is not working in my case. because 1st checking with 2nd, 2nd with 3rd, 3rd with 4th and so on. So, I change it to the most common substring and will check with each one.
The downside of this code is that if something is not common with the most common substring, the final most common substring will be an empty one.
But in my case, it is working.
from difflib import SequenceMatcher
for i in range(1, len(names)):
if i==1:
string1, string2 = names[0], names[i]
else:
string1, string2 = most_common_substring, names[i]
match = SequenceMatcher(None, string1, string2).find_longest_match(0, len(string1), 0, len(string2))
most_common_substring = string1[match.a: match.a + match.size]
print(f"most_common_substring : {most_common_substring}")
python python-3python-difflib

Removing duplicates if paired adjacently

Deleting any pair of adjacent letters with same value. For example, string "aabcc" would become either "aab" or "bcc" after operation.
Sample input = aaabccddd
Sample output = abd
Confused how to iterate the list or the string in a way to match the duplicates and removing them, here is the way I am trying and I know it is wrong.
S = input()
removals = []
for i in range(0, len(S)):
if i + 1 >= len(S):
break
elif S[i] == S[i + 1]:
removals.append(i)
# removals is to store all the indexes that are to be deleted.
removals.append(i + 1)
i += 1
print(i)
Array = list(S)
set(removals) #removes duplicates from removals
for j in range(0, len(removals)):
Array.pop(removals[j]) # Creates IndexOutOfRange error
This is a problem from Hackerrank: Super Reduced String
Removing paired letters can be reduced to reducing runs of letters to an empty sequence if there are an even number of them, 1 if there are an odd number. aaaaaa becomes empty, aaaaa is reduced to a.
To do this on any sequence, use itertools.groupby() and count the group size:
# only include a value if their consecutive count is odd
[v for v, group in groupby(sequence) if sum(1 for _ in group) % 2]
then repeat until the size of the sequence no longer changes:
prev = len(sequence) + 1
while len(sequence) < prev:
prev = len(sequence)
sequence = [v for v, group in groupby(sequence) if sum(1 for _ in group) % 2]
However, since Hackerrank gives you text it'd be faster if you did this with a regular expression:
import re
even = re.compile(r'(?:([a-z])\1)+')
prev = len(text) + 1
while len(text) < prev:
prev = len(text)
text = even.sub(r'', text)
[a-z] in a regex matches a lower-case letter, (..)groups that match, and\1references the first match and will only match if that letter was repeated.(?:...)+asks for repeats of the same two characters.re.sub()` replaces all those patterns with empty text.
The regex approach is good enough to pass that Hackerrank challenge.
You can use stack in order to achieve O(n) time complexity. Iterate over the characters in a string and for each character check if the top of stack contains the same character. In case it does pop the character from stack and move to next item. Otherwise push the character to the stack. Whatever remains in the stack is the result:
s = 'aaabccddd'
stack = []
for c in s:
if stack and stack[-1] == c:
stack.pop()
else:
stack.append(c)
print ''.join(stack) if stack else 'Empty String' # abd
Update Based on the discussion I ran couple of tests to measure the speed of regex and stack based solutions with input length of 100. Tests were run on Python 2.7 on Windows 8:
All same
Regex: 0.0563033799756
Stack: 0.267807865445
Nothing to remove
Regex: 0.075074750044
Stack: 0.183467329017
Worst case
Regex: 1.9983200193
Stack: 0.196362265609
Alphabet
Regex: 0.0759905517997
Stack: 0.182778728207
Code used for benchmarking:
import re
import timeit
def reduce_regexp(text):
even = re.compile(r'(?:([a-z])\1)+')
prev = len(text) + 1
while len(text) < prev:
prev = len(text)
text = even.sub(r'', text)
return text
def reduce_stack(s):
stack = []
for c in s:
if stack and stack[-1] == c:
stack.pop()
else:
stack.append(c)
return ''.join(stack)
CASES = [
['All same', 'a' * 100],
['Nothing to remove', 'ab' * 50],
['Worst case', 'ab' * 25 + 'ba' * 25],
['Alphabet', ''.join([chr(ord('a') + i) for i in range(25)] * 4)]
]
for name, case in CASES:
print(name)
res = timeit.timeit('reduce_regexp(case)',
setup='from __main__ import reduce_regexp, case; import re',
number=10000)
print('Regex: {}'.format(res))
res = timeit.timeit('reduce_stack(case)',
setup='from __main__ import reduce_stack, case',
number=10000)
print('Stack: {}'.format(res))

Find common substring between two strings

I'd like to compare 2 strings and keep the matched, splitting off where the comparison fails.
So if I have 2 strings:
string1 = "apples"
string2 = "appleses"
answer = "apples"
Another example, as the string could have more than one word:
string1 = "apple pie available"
string2 = "apple pies"
answer = "apple pie"
I'm sure there is a simple Python way of doing this but I can't work it out, any help and explanation appreciated.
For completeness, difflib in the standard-library provides loads of sequence-comparison utilities. For instance find_longest_match which finds the longest common substring when used on strings. Example use:
from difflib import SequenceMatcher
string1 = "apple pie available"
string2 = "come have some apple pies"
match = SequenceMatcher(None, string1, string2).find_longest_match()
print(match) # -> Match(a=0, b=15, size=9)
print(string1[match.a:match.a + match.size]) # -> apple pie
print(string2[match.b:match.b + match.size]) # -> apple pie
If you're using a version older than 3.9, you'need to call find_longest_match() with the following arguments:
SequenceMatcher(None, string1, string2).find_longest_match(0, len(string1), 0, len(string2))
One might also consider os.path.commonprefix that works on characters and thus can be used for any strings.
import os
common = os.path.commonprefix(['apple pie available', 'apple pies'])
assert common == 'apple pie'
As the function name indicates, this only considers the common prefix of two strings.
def common_start(sa, sb):
""" returns the longest common substring from the beginning of sa and sb """
def _iter():
for a, b in zip(sa, sb):
if a == b:
yield a
else:
return
return ''.join(_iter())
>>> common_start("apple pie available", "apple pies")
'apple pie'
Or a slightly stranger way:
def stop_iter():
"""An easy way to break out of a generator"""
raise StopIteration
def common_start(sa, sb):
return ''.join(a if a == b else stop_iter() for a, b in zip(sa, sb))
Which might be more readable as
def terminating(cond):
"""An easy way to break out of a generator"""
if cond:
return True
raise StopIteration
def common_start(sa, sb):
return ''.join(a for a, b in zip(sa, sb) if terminating(a == b))
Its called Longest Common Substring problem. Here I present a simple, easy to understand but inefficient solution. It will take a long time to produce correct output for large strings, as the complexity of this algorithm is O(N^2).
def longestSubstringFinder(string1, string2):
answer = ""
len1, len2 = len(string1), len(string2)
for i in range(len1):
match = ""
for j in range(len2):
if (i + j < len1 and string1[i + j] == string2[j]):
match += string2[j]
else:
if (len(match) > len(answer)): answer = match
match = ""
return answer
print(longestSubstringFinder("apple pie available", "apple pies"))
print(longestSubstringFinder("apples", "appleses"))
print(longestSubstringFinder("bapples", "cappleses"))
Output
apple pie
apples
apples
Fix bugs with the first's answer:
def longestSubstringFinder(string1, string2):
answer = ""
len1, len2 = len(string1), len(string2)
for i in range(len1):
for j in range(len2):
lcs_temp = 0
match = ''
while ((i+lcs_temp < len1) and (j+lcs_temp<len2) and string1[i+lcs_temp] == string2[j+lcs_temp]):
match += string2[j+lcs_temp]
lcs_temp += 1
if len(match) > len(answer):
answer = match
return answer
print(longestSubstringFinder("dd apple pie available", "apple pies"))
print(longestSubstringFinder("cov_basic_as_cov_x_gt_y_rna_genes_w1000000", "cov_rna15pcs_as_cov_x_gt_y_rna_genes_w1000000")
print(longestSubstringFinder("bapples", "cappleses"))
print(longestSubstringFinder("apples", "apples"))
The same as Evo's, but with arbitrary number of strings to compare:
def common_start(*strings):
""" Returns the longest common substring
from the beginning of the `strings`
"""
def _iter():
for z in zip(*strings):
if z.count(z[0]) == len(z): # check all elements in `z` are the same
yield z[0]
else:
return
return ''.join(_iter())
The fastest way I've found is to use suffix_trees package:
from suffix_trees import STree
a = ["xxxabcxxx", "adsaabc"]
st = STree.STree(a)
print(st.lcs()) # "abc"
This script requests you the minimum common substring length and gives all common substrings in two strings. Also, it eliminates shorter substrings that longer substrings include already.
def common_substrings(str1,str2):
len1,len2=len(str1),len(str2)
if len1 > len2:
str1,str2=str2,str1
len1,len2=len2,len1
#short string=str1 and long string=str2
min_com = int(input('Please enter the minumum common substring length:'))
cs_array=[]
for i in range(len1,min_com-1,-1):
for k in range(len1-i+1):
if (str1[k:i+k] in str2):
flag=1
for m in range(len(cs_array)):
if str1[k:i+k] in cs_array[m]:
#print(str1[k:i+k])
flag=0
break
if flag==1:
cs_array.append(str1[k:i+k])
if len(cs_array):
print(cs_array)
else:
print('There is no any common substring according to the parametres given')
common_substrings('ciguliuana','ciguana')
common_substrings('apples','appleses')
common_substrings('apple pie available','apple pies')
Try:
import itertools as it
''.join(el[0] for el in it.takewhile(lambda t: t[0] == t[1], zip(string1, string2)))
It does the comparison from the beginning of both strings.
def matchingString(x,y):
match=''
for i in range(0,len(x)):
for j in range(0,len(y)):
k=1
# now applying while condition untill we find a substring match and length of substring is less than length of x and y
while (i+k <= len(x) and j+k <= len(y) and x[i:i+k]==y[j:j+k]):
if len(match) <= len(x[i:i+k]):
match = x[i:i+k]
k=k+1
return match
print matchingString('apple','ale') #le
print matchingString('apple pie available','apple pies') #apple pie
A Trie data structure would work the best, better than DP.
Here is the code.
class TrieNode:
def __init__(self):
self.child = [None]*26
self.endWord = False
class Trie:
def __init__(self):
self.root = self.getNewNode()
def getNewNode(self):
return TrieNode()
def insert(self,value):
root = self.root
for i,character in enumerate(value):
index = ord(character) - ord('a')
if not root.child[index]:
root.child[index] = self.getNewNode()
root = root.child[index]
root.endWord = True
def search(self,value):
root = self.root
for i,character in enumerate(value):
index = ord(character) - ord('a')
if not root.child[index]:
return False
root = root.child[index]
return root.endWord
def main():
# Input keys (use only 'a' through 'z' and lower case)
keys = ["the","anaswe"]
output = ["Not present in trie",
"Present in trie"]
# Trie object
t = Trie()
# Construct trie
for key in keys:
t.insert(key)
# Search for different keys
print("{} ---- {}".format("the",output[t.search("the")]))
print("{} ---- {}".format("these",output[t.search("these")]))
print("{} ---- {}".format("their",output[t.search("their")]))
print("{} ---- {}".format("thaw",output[t.search("thaw")]))
if __name__ == '__main__':
main()
Let me know in case of doubts.
In case we have a list of words that we need to find all common substrings I check some of the codes above and the best was https://stackoverflow.com/a/42882629/8520109 but it has some bugs for example 'histhome' and 'homehist'. In this case, we should have 'hist' and 'home' as a result. Furthermore, it differs if the order of arguments is changed. So I change the code to find every block of substring and it results a set of common substrings:
main = input().split(" ") #a string of words separated by space
def longestSubstringFinder(string1, string2):
'''Find the longest matching word'''
answer = ""
len1, len2 = len(string1), len(string2)
for i in range(len1):
for j in range(len2):
lcs_temp=0
match=''
while ((i+lcs_temp < len1) and (j+lcs_temp<len2) and string1[i+lcs_temp] == string2[j+lcs_temp]):
match += string2[j+lcs_temp]
lcs_temp+=1
if (len(match) > len(answer)):
answer = match
return answer
def listCheck(main):
'''control the input for finding substring in a list of words'''
string1 = main[0]
result = []
for i in range(1, len(main)):
string2 = main[i]
res1 = longestSubstringFinder(string1, string2)
res2 = longestSubstringFinder(string2, string1)
result.append(res1)
result.append(res2)
result.sort()
return result
first_answer = listCheck(main)
final_answer = []
for item1 in first_answer: #to remove some incorrect match
string1 = item1
double_check = True
for item2 in main:
string2 = item2
if longestSubstringFinder(string1, string2) != string1:
double_check = False
if double_check:
final_answer.append(string1)
print(set(final_answer))
main = 'ABACDAQ BACDAQA ACDAQAW XYZCDAQ' #>>> {'CDAQ'}
main = 'homehist histhome' #>>> {'hist', 'home'}
def LongestSubString(s1,s2):
if len(s1)<len(s2) :
s1,s2 = s2,s1
maxsub =''
for i in range(len(s2)):
for j in range(len(s2),i,-1):
if s2[i:j] in s1 and j-i>len(maxsub):
return s2[i:j]
Returns the first longest common substring:
def compareTwoStrings(string1, string2):
list1 = list(string1)
list2 = list(string2)
match = []
output = ""
length = 0
for i in range(0, len(list1)):
if list1[i] in list2:
match.append(list1[i])
for j in range(i + 1, len(list1)):
if ''.join(list1[i:j]) in string2:
match.append(''.join(list1[i:j]))
else:
continue
else:
continue
for string in match:
if length < len(list(string)):
length = len(list(string))
output = string
else:
continue
return output
**Return the comman longest substring**
def longestSubString(str1, str2):
longestString = ""
maxLength = 0
for i in range(0, len(str1)):
if str1[i] in str2:
for j in range(i + 1, len(str1)):
if str1[i:j] in str2:
if(len(str1[i:j]) > maxLength):
maxLength = len(str1[i:j])
longestString = str1[i:j]
return longestString
This is the classroom problem called 'Longest sequence finder'. I have given some simple code that worked for me, also my inputs are lists of a sequence which can also be a string:
def longest_substring(list1,list2):
both=[]
if len(list1)>len(list2):
small=list2
big=list1
else:
small=list1
big=list2
removes=0
stop=0
for i in small:
for j in big:
if i!=j:
removes+=1
if stop==1:
break
elif i==j:
both.append(i)
for q in range(removes+1):
big.pop(0)
stop=1
break
removes=0
return both
As if this question doesn't have enough answers, here's another option:
from collections import defaultdict
def LongestCommonSubstring(string1, string2):
match = ""
matches = defaultdict(list)
str1, str2 = sorted([string1, string2], key=lambda x: len(x))
for i in range(len(str1)):
for k in range(i, len(str1)):
cur = match + str1[k]
if cur in str2:
match = cur
else:
match = ""
if match:
matches[len(match)].append(match)
if not matches:
return ""
longest_match = max(matches.keys())
return matches[longest_match][0]
Some example cases:
LongestCommonSubstring("whose car?", "this is my car")
> ' car'
LongestCommonSubstring("apple pies", "apple? forget apple pie!")
> 'apple pie'
This isn't the most efficient way to do it but it's what I could come up with and it works. If anyone can improve it, please do. What it does is it makes a matrix and puts 1 where the characters match. Then it scans the matrix to find the longest diagonal of 1s, keeping track of where it starts and ends. Then it returns the substring of the input string with the start and end positions as arguments.
Note: This only finds one longest common substring. If there's more than one, you could make an array to store the results in and return that Also, it's case sensitive so (Apple pie, apple pie) will return pple pie.
def longestSubstringFinder(str1, str2):
answer = ""
if len(str1) == len(str2):
if str1==str2:
return str1
else:
longer=str1
shorter=str2
elif (len(str1) == 0 or len(str2) == 0):
return ""
elif len(str1)>len(str2):
longer=str1
shorter=str2
else:
longer=str2
shorter=str1
matrix = numpy.zeros((len(shorter), len(longer)))
for i in range(len(shorter)):
for j in range(len(longer)):
if shorter[i]== longer[j]:
matrix[i][j]=1
longest=0
start=[-1,-1]
end=[-1,-1]
for i in range(len(shorter)-1, -1, -1):
for j in range(len(longer)):
count=0
begin = [i,j]
while matrix[i][j]==1:
finish=[i,j]
count=count+1
if j==len(longer)-1 or i==len(shorter)-1:
break
else:
j=j+1
i=i+1
i = i-count
if count>longest:
longest=count
start=begin
end=finish
break
answer=shorter[int(start[0]): int(end[0])+1]
return answer
First a helper function adapted from the itertools pairwise recipe to produce substrings.
import itertools
def n_wise(iterable, n = 2):
'''n = 2 -> (s0,s1), (s1,s2), (s2, s3), ...
n = 3 -> (s0,s1, s2), (s1,s2, s3), (s2, s3, s4), ...'''
a = itertools.tee(iterable, n)
for x, thing in enumerate(a[1:]):
for _ in range(x+1):
next(thing, None)
return zip(*a)
Then a function the iterates over substrings, longest first, and tests for membership. (efficiency not considered)
def foo(s1, s2):
'''Finds the longest matching substring
'''
# the longest matching substring can only be as long as the shortest string
#which string is shortest?
shortest, longest = sorted([s1, s2], key = len)
#iterate over substrings, longest substrings first
for n in range(len(shortest)+1, 2, -1):
for sub in n_wise(shortest, n):
sub = ''.join(sub)
if sub in longest:
#return the first one found, it should be the longest
return sub
s = "fdomainster"
t = "exdomainid"
print(foo(s,t))
>>>
domain
>>>
def LongestSubString(s1,s2):
left = 0
right =len(s2)
while(left<right):
if(s2[left] not in s1):
left = left+1
else:
if(s2[left:right] not in s1):
right = right - 1
else:
return(s2[left:right])
s1 = "pineapple"
s2 = "applc"
print(LongestSubString(s1,s2))

How to find all occurrences of a substring?

Python has string.find() and string.rfind() to get the index of a substring in a string.
I'm wondering whether there is something like string.find_all() which can return all found indexes (not only the first from the beginning or the first from the end).
For example:
string = "test test test test"
print string.find('test') # 0
print string.rfind('test') # 15
#this is the goal
print string.find_all('test') # [0,5,10,15]
For counting the occurrences, see Count number of occurrences of a substring in a string.
There is no simple built-in string function that does what you're looking for, but you could use the more powerful regular expressions:
import re
[m.start() for m in re.finditer('test', 'test test test test')]
#[0, 5, 10, 15]
If you want to find overlapping matches, lookahead will do that:
[m.start() for m in re.finditer('(?=tt)', 'ttt')]
#[0, 1]
If you want a reverse find-all without overlaps, you can combine positive and negative lookahead into an expression like this:
search = 'tt'
[m.start() for m in re.finditer('(?=%s)(?!.{1,%d}%s)' % (search, len(search)-1, search), 'ttt')]
#[1]
re.finditer returns a generator, so you could change the [] in the above to () to get a generator instead of a list which will be more efficient if you're only iterating through the results once.
>>> help(str.find)
Help on method_descriptor:
find(...)
S.find(sub [,start [,end]]) -> int
Thus, we can build it ourselves:
def find_all(a_str, sub):
start = 0
while True:
start = a_str.find(sub, start)
if start == -1: return
yield start
start += len(sub) # use start += 1 to find overlapping matches
list(find_all('spam spam spam spam', 'spam')) # [0, 5, 10, 15]
No temporary strings or regexes required.
Here's a (very inefficient) way to get all (i.e. even overlapping) matches:
>>> string = "test test test test"
>>> [i for i in range(len(string)) if string.startswith('test', i)]
[0, 5, 10, 15]
Use re.finditer:
import re
sentence = input("Give me a sentence ")
word = input("What word would you like to find ")
for match in re.finditer(word, sentence):
print (match.start(), match.end())
For word = "this" and sentence = "this is a sentence this this" this will yield the output:
(0, 4)
(19, 23)
(24, 28)
Again, old thread, but here's my solution using a generator and plain str.find.
def findall(p, s):
'''Yields all the positions of
the pattern p in the string s.'''
i = s.find(p)
while i != -1:
yield i
i = s.find(p, i+1)
Example
x = 'banananassantana'
[(i, x[i:i+2]) for i in findall('na', x)]
returns
[(2, 'na'), (4, 'na'), (6, 'na'), (14, 'na')]
You can use re.finditer() for non-overlapping matches.
>>> import re
>>> aString = 'this is a string where the substring "is" is repeated several times'
>>> print [(a.start(), a.end()) for a in list(re.finditer('is', aString))]
[(2, 4), (5, 7), (38, 40), (42, 44)]
but won't work for:
In [1]: aString="ababa"
In [2]: print [(a.start(), a.end()) for a in list(re.finditer('aba', aString))]
Output: [(0, 3)]
Come, let us recurse together.
def locations_of_substring(string, substring):
"""Return a list of locations of a substring."""
substring_length = len(substring)
def recurse(locations_found, start):
location = string.find(substring, start)
if location != -1:
return recurse(locations_found + [location], location+substring_length)
else:
return locations_found
return recurse([], 0)
print(locations_of_substring('this is a test for finding this and this', 'this'))
# prints [0, 27, 36]
No need for regular expressions this way.
If you're just looking for a single character, this would work:
string = "dooobiedoobiedoobie"
match = 'o'
reduce(lambda count, char: count + 1 if char == match else count, string, 0)
# produces 7
Also,
string = "test test test test"
match = "test"
len(string.split(match)) - 1
# produces 4
My hunch is that neither of these (especially #2) is terribly performant.
this is an old thread but i got interested and wanted to share my solution.
def find_all(a_string, sub):
result = []
k = 0
while k < len(a_string):
k = a_string.find(sub, k)
if k == -1:
return result
else:
result.append(k)
k += 1 #change to k += len(sub) to not search overlapping results
return result
It should return a list of positions where the substring was found.
Please comment if you see an error or room for improvment.
This does the trick for me using re.finditer
import re
text = 'This is sample text to test if this pythonic '\
'program can serve as an indexing platform for '\
'finding words in a paragraph. It can give '\
'values as to where the word is located with the '\
'different examples as stated'
# find all occurances of the word 'as' in the above text
find_the_word = re.finditer('as', text)
for match in find_the_word:
print('start {}, end {}, search string \'{}\''.
format(match.start(), match.end(), match.group()))
This thread is a little old but this worked for me:
numberString = "onetwothreefourfivesixseveneightninefiveten"
testString = "five"
marker = 0
while marker < len(numberString):
try:
print(numberString.index("five",marker))
marker = numberString.index("five", marker) + 1
except ValueError:
print("String not found")
marker = len(numberString)
You can try :
>>> string = "test test test test"
>>> for index,value in enumerate(string):
if string[index:index+(len("test"))] == "test":
print index
0
5
10
15
You can try :
import re
str1 = "This dress looks good; you have good taste in clothes."
substr = "good"
result = [_.start() for _ in re.finditer(substr, str1)]
# result = [17, 32]
When looking for a large amount of key words in a document, use flashtext
from flashtext import KeywordProcessor
words = ['test', 'exam', 'quiz']
txt = 'this is a test'
kwp = KeywordProcessor()
kwp.add_keywords_from_list(words)
result = kwp.extract_keywords(txt, span_info=True)
Flashtext runs faster than regex on large list of search words.
This function does not look at all positions inside the string, it does not waste compute resources. My try:
def findAll(string,word):
all_positions=[]
next_pos=-1
while True:
next_pos=string.find(word,next_pos+1)
if(next_pos<0):
break
all_positions.append(next_pos)
return all_positions
to use it call it like this:
result=findAll('this word is a big word man how many words are there?','word')
src = input() # we will find substring in this string
sub = input() # substring
res = []
pos = src.find(sub)
while pos != -1:
res.append(pos)
pos = src.find(sub, pos + 1)
Whatever the solutions provided by others are completely based on the available method find() or any available methods.
What is the core basic algorithm to find all the occurrences of a
substring in a string?
def find_all(string,substring):
"""
Function: Returning all the index of substring in a string
Arguments: String and the search string
Return:Returning a list
"""
length = len(substring)
c=0
indexes = []
while c < len(string):
if string[c:c+length] == substring:
indexes.append(c)
c=c+1
return indexes
You can also inherit str class to new class and can use this function
below.
class newstr(str):
def find_all(string,substring):
"""
Function: Returning all the index of substring in a string
Arguments: String and the search string
Return:Returning a list
"""
length = len(substring)
c=0
indexes = []
while c < len(string):
if string[c:c+length] == substring:
indexes.append(c)
c=c+1
return indexes
Calling the method
newstr.find_all('Do you find this answer helpful? then upvote
this!','this')
This is solution of a similar question from hackerrank. I hope this could help you.
import re
a = input()
b = input()
if b not in a:
print((-1,-1))
else:
#create two list as
start_indc = [m.start() for m in re.finditer('(?=' + b + ')', a)]
for i in range(len(start_indc)):
print((start_indc[i], start_indc[i]+len(b)-1))
Output:
aaadaa
aa
(0, 1)
(1, 2)
(4, 5)
Here's a solution that I came up with, using assignment expression (new feature since Python 3.8):
string = "test test test test"
phrase = "test"
start = -1
result = [(start := string.find(phrase, start + 1)) for _ in range(string.count(phrase))]
Output:
[0, 5, 10, 15]
I think the most clean way of solution is without libraries and yields:
def find_all_occurrences(string, sub):
index_of_occurrences = []
current_index = 0
while True:
current_index = string.find(sub, current_index)
if current_index == -1:
return index_of_occurrences
else:
index_of_occurrences.append(current_index)
current_index += len(sub)
find_all_occurrences(string, substr)
Note: find() method returns -1 when it can't find anything
The pythonic way would be:
mystring = 'Hello World, this should work!'
find_all = lambda c,s: [x for x in range(c.find(s), len(c)) if c[x] == s]
# s represents the search string
# c represents the character string
find_all(mystring,'o') # will return all positions of 'o'
[4, 7, 20, 26]
>>>
if you only want to use numpy here is a solution
import numpy as np
S= "test test test test"
S2 = 'test'
inds = np.cumsum([len(k)+len(S2) for k in S.split(S2)[:-1]])- len(S2)
print(inds)
if you want to use without re(regex) then:
find_all = lambda _str,_w : [ i for i in range(len(_str)) if _str.startswith(_w,i) ]
string = "test test test test"
print( find_all(string, 'test') ) # >>> [0, 5, 10, 15]
please look at below code
#!/usr/bin/env python
# coding:utf-8
'''黄哥Python'''
def get_substring_indices(text, s):
result = [i for i in range(len(text)) if text.startswith(s, i)]
return result
if __name__ == '__main__':
text = "How much wood would a wood chuck chuck if a wood chuck could chuck wood?"
s = 'wood'
print get_substring_indices(text, s)
def find_index(string, let):
enumerated = [place for place, letter in enumerate(string) if letter == let]
return enumerated
for example :
find_index("hey doode find d", "d")
returns:
[4, 7, 13, 15]
Not exactly what OP asked but you could also use the split function to get a list of where all the substrings don't occur. OP didn't specify the end goal of the code but if your goal is to remove the substrings anyways then this could be a simple one-liner. There are probably more efficient ways to do this with larger strings; regular expressions would be preferable in that case
# Extract all non-substrings
s = "an-example-string"
s_no_dash = s.split('-')
# >>> s_no_dash
# ['an', 'example', 'string']
# Or extract and join them into a sentence
s_no_dash2 = ' '.join(s.split('-'))
# >>> s_no_dash2
# 'an example string'
Did a brief skim of other answers so apologies if this is already up there.
def count_substring(string, sub_string):
c=0
for i in range(0,len(string)-2):
if string[i:i+len(sub_string)] == sub_string:
c+=1
return c
if __name__ == '__main__':
string = input().strip()
sub_string = input().strip()
count = count_substring(string, sub_string)
print(count)
I runned in the same problem and did this:
hw = 'Hello oh World!'
list_hw = list(hw)
o_in_hw = []
while True:
o = hw.find('o')
if o != -1:
o_in_hw.append(o)
list_hw[o] = ' '
hw = ''.join(list_hw)
else:
print(o_in_hw)
break
Im pretty new at coding so you can probably simplify it (and if planned to used continuously of course make it a function).
All and all it works as intended for what i was doing.
Edit: Please consider this is for single characters only, and it will change your variable, so you have to create a copy of the string in a new variable to save it, i didnt put it in the code cause its easy and its only to show how i made it work.
By slicing we find all the combinations possible and append them in a list and find the number of times it occurs using count function
s=input()
n=len(s)
l=[]
f=input()
print(s[0])
for i in range(0,n):
for j in range(1,n+1):
l.append(s[i:j])
if f in l:
print(l.count(f))
To find all the occurence of a character in a give string and return as a dictionary
eg: hello
result :
{'h':1, 'e':1, 'l':2, 'o':1}
def count(string):
result = {}
if(string):
for i in string:
result[i] = string.count(i)
return result
return {}
or else you do like this
from collections import Counter
def count(string):
return Counter(string)

Categories