Anagram check in Python - python

I am trying to write a program that will compare two lists of words and check the words to see if they are anagrams.
eg.,
input : ['cinema','host','aab','train'], ['iceman', 'shot', 'bab', 'rain']
I am using the below code:
#!/usr/bin/env python
# Question code, kept as posted: Python 2 print statements, and the
# indentation is missing in this copy.
# anagram_dict maps the position of a word pair to 1 (anagram) or 0 (not).
anagram_dict = {}
def anagram_solver(first_words,second_words):
for word in first_words:
first_word = list(word)
second_word = list(second_words[first_words.index(word)])
# NOTE: plain assignment binds a second name to the SAME list object;
# no copy is made here.
first_copy = first_word
# NOTE(review): "second-word" parses as a subtraction (second minus word) and
# "second" is not defined in this snippet - probably meant second_word.
second_copy = second-word
if len(first_word) != len(second_word):
anagram_dict[first_words.index(word)] = 0
else:
# first_copy is the same list as first_word, so the remove() calls below
# shrink the very list this loop is iterating over.
for char in first_word:
second_word = second_copy
if char in second_word:
first_copy.remove(char)
second_copy.remove(char)
else:
pass
if len(first_copy) == len(second_copy):
print first_copy
print second_copy
anagram_dict[first_words.index(word)] = 1
else:
anagram_dict[first_words.index(word)] = 0
for k,v in anagram_dict.items():
print "%d : %d" %(k,v)
if __name__ == "__main__":
anagram_solver(['cinema','host','aab','train'],['iceman','shot','bab','rain'])
When I execute this script, the loop `for char in first_word:` skips every other list item. For example, if it is processing the list ['c','i','n','e','m','a'],
it only processes 'c', 'n' and 'm' and ignores the other items. If I remove the list.remove() calls, it no longer skips items.
One can execute this script to better understand, what I am trying to explain here.
Just wondering why is this behavior and how to overcome this ?

You can simply sort the words and check if they are equal:
def anagram_solver(first_words, second_words):
    """Report, position by position, whether two word lists hold anagrams.

    Args:
        first_words: Sequence of words.
        second_words: Sequence of words, at least as long as ``first_words``
            (surplus entries are ignored).

    Returns:
        A list with one boolean per entry of ``first_words``; element ``i``
        is True when ``first_words[i]`` and ``second_words[i]`` are made of
        exactly the same letters.

    Raises:
        IndexError: If ``second_words`` is shorter than ``first_words``.
    """
    # enumerate() replaces the Python 2-only xrange(len(...)) loop, and
    # sorted() accepts a string directly, so no list()/sort() pair is needed.
    return [
        sorted(word) == sorted(second_words[i])
        for i, word in enumerate(first_words)
    ]
Example:
>>> a = ['cinema','host','aab','train']
>>> b = ['iceman', 'shot', 'bab', 'rain']
>>> anagram_solver(a, b)
[True, True, False, False]

Python handles lists by reference, so when you set first_copy = first_word, you're actually just making first_copy and first_word point to the same list. You can overcome this behavior (actually copy the list) using
first_copy = first_word[:]
second_copy = second_word[:]

To answer to your question according to its title: "Anagram check in Python"
You can do that in three lines:
first_words = ['cinema', 'host', 'aab', 'train']
second_words = ['iceman', 'shot', 'bab', 'rain']
# Two words are anagrams exactly when their sorted letters are equal.
# print() is called as a function so the snippet also runs on Python 3.
print([sorted(a) == sorted(b) for (a, b) in zip(first_words, second_words)])
Producing:
[True, True, False, False]

You can use enumerate with sorted:
[sorted(a[ind]) == sorted(ele) for ind, ele in enumerate(b)]

There are two ways to do this. One is pretty easy; the other is a bit more complicated but avoids sorting.
First Method
def anagram1(s1, s2):
    """Return True when ``s1`` and ``s2`` are anagrams of each other.

    Spaces are ignored and the comparison is case-insensitive.
    """
    # Normalise both inputs: drop the spaces and lower-case the rest.
    s1 = s1.replace(' ', '').lower()
    s2 = s2.replace(' ', '').lower()
    # Anagrams have identical sorted character sequences.
    return sorted(s1) == sorted(s2)
The next Method is bit longer:
def anagram2(s1, s2):
    """Return True when ``s1`` and ``s2`` are anagrams, by counting letters.

    Spaces are ignored and the comparison is case-insensitive. Unlike the
    sorting approach, this makes a single pass over each string.
    """
    # Remove spaces and lower-case both strings.
    s1 = s1.replace(' ', '').lower()
    s2 = s2.replace(' ', '').lower()
    # Edge case: strings of different length can never be anagrams.
    if len(s1) != len(s2):
        return False
    # Count how many times each letter occurs in s1 ...
    count = {}
    for letter in s1:
        count[letter] = count.get(letter, 0) + 1
    # ... and for s2 do the opposite: take one away per occurrence.
    for letter in s2:
        if letter not in count:
            # s2 uses a letter that s1 does not contain at all.
            return False
        count[letter] -= 1
    # Every counter must be back at zero for the letters to match exactly.
    return all(remaining == 0 for remaining in count.values())

def anagram(string_one, string_two):
    """Return True when the two strings are anagrams of each other.

    Spaces are ignored and the comparison is case-insensitive.
    """
    string_one = string_one.replace(' ', '').lower()
    string_two = string_two.replace(' ', '').lower()
    # sorted() turns each string into a sorted list of characters, replacing
    # the manual append loops and list.sort(); the comparison already yields
    # the boolean, so no if/else is required.
    return sorted(string_one) == sorted(string_two)

Related

Why does my while loop calculate incorrect value of the string?

I am trying to find greatest length of a word from the string return it by using values of each letter from alphabets by assigning each letter it's value as per it's rank . So for example For a string s = 'abcd a', I intend to return 10 [a=1 + b=2 + c =3 + d=4] .But, I am getting output as 7 When I debugged the code, I noticed that in while loop my code skips i=2 and directly jumps on i=3. Where am I going wrong? Below is my code.
# Question code, kept as posted (indentation is missing in this copy).
class Solution(object):
def highest_scoring_word(self,s):
# Dictionary of English letters
dt = {'a':1,'b':2,'c':3,'d':4,'e':5,'f':6,
'g':7,'h':8,'i':9,'j':10,'k':11,'l':12,
'm':13,'n':14,'o':15,'p':16,'q':17,
'r':18,'s':19,'t':20,'u':21,'v':22,
'w':23,'x':24,'y':25,'z':26}
value_sum =0
max_value =value_sum
# NOTE(review): i ranges over the characters of s, but is used below to
# index the list of words - confirm intent.
for i in range(0,len(s)):
# s.upper() returns a string, which is truthy whenever s is non-empty, so
# this is not a test for upper-case input.
if s.upper():
s= s.lower()
words = s.split()
# convert the string in char array
to_char_array = list(words[i])
j=0
while j<len(to_char_array):
if to_char_array[j] in dt.keys() :
value_sum = max(dt.get(to_char_array[j]),value_sum + dt.get(to_char_array[j]))
max_value = max(value_sum,max_value)
else:
pass
# NOTE: "j += j+1" adds j+1 to j (0 -> 1 -> 3 -> 7 ...), so index 2 is
# skipped; "j += 1" would advance one character at a time.
j +=j+1
return max_value
if __name__ == '__main__':
p = 'abcd a'
print(Solution().highest_scoring_word(p))
`
I have created a dictionary where I have stored all letters in english alphabet and their values and later I have split the string into words using split() and then after converting each individual word into character array I have traversed it to find their occurrence in the dictionary and add to the final value. I am expecting to get a correct value of a string and finally the greatest value.
As you are using a class and methods, make use of them:
from string import ascii_lowercase as dt
class Solution(object):
    """Score the words of a string by summing their letters' alphabet ranks."""

    def __init__(self, data):
        # Maps each word to its score; filled in by get_scoring().
        self.scores = {}
        # Lower-cased, whitespace-separated words of the input.
        self.words = data.lower().strip().split()

    def get_scoring(self):
        """Score every word and return the word with the highest score.

        A letter is worth its 1-based position in ``dt`` ('a' = 1 ...
        'z' = 26). Characters that are not in ``dt`` add 0, because
        ``str.find`` returns -1 for them.

        Returns:
            The highest-scoring word.

        Raises:
            ValueError: If the input contained no words.
        """
        # For each word calculate the score.
        for word in self.words:
            self.scores[word] = sum(dt.find(c) + 1 for c in word)
        print(self.scores)
        # Pick the dictionary key whose score is the greatest.
        return max(self.scores, key=self.scores.get)


if __name__ == '__main__':
    p = 'abcd fg11'
    maxWord = Solution(p).get_scoring()
    print(maxWord)
Out:
{'abcd': 10, 'fg11': 13}
fg11
Try using this:
# Variant of the question's class (indentation is missing in this copy).
# When the input holds more than one word, only words[0] and words[1] are
# scored and compared; further words are not looked at.
class Solution(object):
def highest_scoring_word(self,s):
# Dictionary of English letters
dt = {'a':1,'b':2,'c':3,'d':4,'e':5,'f':6,
'g':7,'h':8,'i':9,'j':10,'k':11,'l':12,
'm':13,'n':14,'o':15,'p':16,'q':17,
'r':18,'s':19,'t':20,'u':21,'v':22,
'w':23,'x':24,'y':25,'z':26}
# Running total / best total for the first word ...
value_sum1 =0
max_value1 =value_sum1
# ... and for the second word.
value_sum2 =0
max_value2 =value_sum2
for i in range(0,len(s)):
if s.upper():
s= s.lower()
words = s.split()
if len(words)>1:
# convert the string in char array
to_char_array = list(words[0])
j=0
while j<len(to_char_array):
if to_char_array[j] in dt.keys() :
value_sum1 = max(dt.get(to_char_array[j]),value_sum1 + dt.get(to_char_array[j]))
max_value1 = max(value_sum1,max_value1)
else:
pass
j=j+1
to_char_array = list(words[1])
j=0
while j<len(to_char_array):
if to_char_array[j] in dt.keys() :
value_sum2 = max(dt.get(to_char_array[j]),value_sum2 + dt.get(to_char_array[j]))
max_value2 = max(value_sum2,max_value2)
else:
pass
j=j+1
# Return the larger of the two scores.
if max_value2>max_value1:
return max_value2
elif max_value1>max_value2:
return max_value1
else:
# NOTE: on a tie the return value is a message string, not a number.
return 'Both words have equal score'
else:
# convert the string in char array
to_char_array = list(words[i])
j=0
while j<len(to_char_array):
if to_char_array[j] in dt.keys() :
value_sum1 = max(dt.get(to_char_array[j]),value_sum1 + dt.get(to_char_array[j]))
max_value1 = max(value_sum1,max_value1)
else:
pass
j=j+1
return max_value1
if __name__ == '__main__':
p = 'abcd fg'
print(Solution().highest_scoring_word(p))
It is maybe of interest that the code can be greatly simplified by using features available in Python:
the_sum = sum(ord(c)-96 for c in s.lower() if c.isalpha())
To break this down: `for c in s.lower()` yields the lower-case characters one by one; `if c.isalpha()` keeps only the letters; the function ord() gives the numerical value of a character, with 'a' being 97, so we subtract 96 to get 1. Then sum() adds up all the numbers. You could break up this one line and check how the separate parts work.

Python taking too long to exectute simple code... might have entered an infinite loop

I've tried changing variables in case I made a scope error, etc., but nothing seems to work.
I've defined multiple functions for finding frequency of words that appear in a string. It evaluates till two functions but the last function always enters infinite loop... except when there is no repetition.
# Question code, kept as posted (indentation is missing in this copy).
# freq_finder: count how often each space-separated word occurs in k.
def freq_finder(k):
# NOTE: the local name "dict" shadows the built-in dict type inside this function.
dict = {}
k = k.split(' ')
for word in k:
if word in dict:
dict[word] += 1
else:
dict[word] = 1
return dict
# freq_max: return (list of the most frequent words in l, their count).
def freq_max(l):
to_ = freq_finder(l)
values = to_.values()
best = max(values)
words = []
for t in to_:
if to_[t] == best:
words.append(t)
return (words, best)
# freq_maxi: same idea as freq_max, but takes an already-built frequency dict h.
def freq_maxi(h):
values = h.values()
best = max(values)
words = []
for t in h:
# NOTE: this compares the whole dict h with the number best, which is never
# equal, so words always stays empty; h[t] == best would compare the count
# of word t.
if h == best:
words.append(t)
return (words, best)
# words_above_freq: repeatedly take the most frequent words out of the
# frequency dict while their count is at least freq.
def words_above_freq(r, freq):
result = []
temp_faltu = freq_finder(r)
done = False
# Because freq_maxi returns no words, nothing is deleted from temp_faltu and
# the same best count is found on every pass; the loop cannot end once
# best >= freq.
while not done:
temp = freq_maxi(temp_faltu)
if temp[1] >= freq: # temp[1] is 'best' that was a return from freq_max
result.append(temp)
for w in temp[0]: # temp[0] is the 'words'
del(temp_faltu[w])
else:
done = True
return result
horde = "I was not was I not"
print(freq_finder(horde))
print(freq_max(horde))
print(words_above_freq(horde, 2))
The below function is returning an empty array for words...
def freq_max(l):
    """Return the most frequent word(s) of the string ``l`` and their count.

    Returns:
        A tuple ``(words, best)``: ``best`` is the highest count reported by
        ``freq_finder(l)`` and ``words`` lists every word with that count.
    """
    to_ = freq_finder(l)
    best = max(to_.values())
    # Keep every word whose count equals the maximum.
    words = [t for t in to_ if to_[t] == best]
    return (words, best)
This is in turn causing an infinite loop as a condition never gets met for the next function. You actually have many issues in this code and it is way overcomplicated for what you are doing. For example, do not delete elements in an array that is being looped through. This is very nasty.
Finally, look into a module called collections that can do this and more...
https://pymotw.com/2/collections/counter.html
Or more simply loop through this to find a counter...
horde = "I was not was I not"
substring = "I"
count = horde.count(substring)
# print count
print("The count is:", count)
>>> def freq_finder(k):
... d = {}
... k = k.split(' ')
... for word in k:
... if word in d:
... d[word] += 1
... else:
... d[word] = 1
... return d
...
>>> horde = "I was not was I not"
>>> print(freq_finder(horde))
{'I': 2, 'was': 2, 'not': 2}
One problem is that you're overwriting dict, which is the Python class for dictionaries.
Using another name (for example d) fixes that.
>>> def freq_max(l):
... to_ = freq_finder(l)
... values = to_.values()
... best = max(values)
... words = []
... for t in to_:
... if to_[t] == best:
... words.append(t)
... return (words, best)
...
>>> print(freq_max(horde))
(['I', 'was', 'not'], 2)
this seems ok, I don't think you had an infinite loop, this cannot happen because freq_finder cannot return an infinite dictionary.
Hint:
add breakpoint() in the code where you want to understand things and use
!<variable name> to print variables
n to go ahead by 1 line
l to print where you're in the code
c to run till the end
exit to exit
for example create a file called freq_utils.py and put the code inside:
def freq_finder(k):
breakpoint()
d = {}
k = k.split(' ')
for word in k:
if word in d:
d[word] += 1
else:
d[word] = 1
return d
horde = "I was not was I not"
print(freq_finder(horde))
$ python3 /tmp/freq_utils.py
> /tmp/freq_utils.py(4)freq_finder()
-> d = {}
(Pdb) !d
*** NameError: name 'd' is not defined
(Pdb) n
> /tmp/freq_utils.py(5)freq_finder()
-> k = k.split(' ')
(Pdb) !d
{}
(Pdb) n
> /tmp/freq_utils.py(6)freq_finder()
-> for word in k:
(Pdb) n
> /tmp/freq_utils.py(7)freq_finder()
-> if word in d:
(Pdb) n
> /tmp/freq_utils.py(10)freq_finder()
-> d[word] = 1
(Pdb) n
> /tmp/freq_utils.py(6)freq_finder()
-> for word in k:
(Pdb) !d
{'I': 1}
(Pdb) c
{'I': 2, 'was': 2, 'not': 2}

How to find the longest common substring between two strings using Python?

I want to write a Python code that computes the longest common substring between two strings from the input.
Example:
word1 = input('Give 1. word: xlaqseabcitt')
word2 = input('Give 2. word: peoritabcpeor')
Wanted output:
abc
I have code like this so far:
# Question code, kept as posted (indentation is missing in this copy).
word1 = input("Give 1. word: ")
word2 = input("Give 2. word: ")
longestSegment = ""
tempSegment = ""
# NOTE: the two words are only compared at the same index i, so a common
# substring at different offsets is not detected, and word2[i] is out of
# range as soon as i >= len(word2).
for i in range(len(word1)):
if word1[i] == word2[i]:
tempSegment += word1[i]
else:
tempSegment = ""
if len(tempSegment) > len(longestSegment):
longestSegment = tempSegment
print(longestSegment)
I end up with IndexError when word2 is shorter than word1, and it does not give me the common substring.
EDIT: I found this solution:
string1 = input('Give 1. word: ')
string2 = input('Give 2. word: ')
answer = ""
len1, len2 = len(string1), len(string2)
# Try every pair of start positions (i in string1, j in string2) and extend
# the match for as long as the characters keep agreeing.
for i in range(len1):
    for j in range(len2):
        lcs_temp = 0
        match = ''
        while (i + lcs_temp < len1 and j + lcs_temp < len2
               and string1[i + lcs_temp] == string2[j + lcs_temp]):
            match += string2[j + lcs_temp]
            lcs_temp += 1
        # Keep the first match that is strictly longer than the best so far.
        if len(match) > len(answer):
            answer = match
print(answer)
However, I would like to see a library function call that could be used to compute the longest common substring between two strings.
Alternatively, please suggest a more concise code to achieve the same.
You can build a dictionary from the first string containing the positions of each character, keyed on the characters. Then go through the second string and compare the substring of each character with the rest of the second string at that position:
# extract common prefix
def common(A, B):
    """Return the longest common prefix of the strings ``A`` and ``B``."""
    # Lazily yields the indices at which the two strings differ.
    firstDiff = (i for i, (a, b) in enumerate(zip(A, B)) if a != b)
    # If they never differ within the overlap, the whole shorter string is
    # common.
    commonLen = next(firstDiff, min(len(A), len(B)))
    return A[:commonLen]


word1 = "xlaqseabcitt"
word2 = "peoritabcpeor"

# position(s) of each character in word1
sub1 = dict()
for i, c in enumerate(word1):
    sub1.setdefault(c, []).append(i)

# maximum (by length) of common prefixes from matching first characters;
# default="" covers words that share no character at all (max() of an empty
# iterable would otherwise raise ValueError)
maxSub = max((common(word2[i:], word1[j:])
              for i, c in enumerate(word2)
              for j in sub1.get(c, [])), key=len, default="")
print(maxSub)  # abc
For me, looks like the solution that works is using the suffix_trees package:
from suffix_trees import STree
a = ["xxx ABC xxx", "adsa abc"]
st = STree.STree(a)
print(st.lcs()) # "abc"
Here is an answer if you later want to compute any number of strings. It should return the longest common substring. It work with the different test i gave it. (as long as you don't use the '§' character)
It is not a library but you can still import the functions in your code just like a library. You can use the same logic with your own code (only for two strings.) Do so as follows (put both files in the same directory for the sake of simplicity). I am supposing you will call the file findmatch.py.
import findmatch
longest_substring = findmatch.prep(['list', 'of', 'strings'])
Here is the code that should be in 'findmatch.py'.
# main(words, first): walk through words, looking for pieces of each word
# that also occur in the current reference string (initially first), and
# return the final reference. Progress is printed along the way.
# NOTE(review): intermediate substrings are stored in, and read back from,
# the dict returned by locals(). Whether such writes persist between calls
# is implementation-dependent - check the locals() documentation for the
# target Python version.
def main(words,first):
nextreference = first
reference = first
for word in words:
foundsub = False
print('reference : ',reference)
print('word : ', word)
num_of_substring = 0
# NOTE: length_longest_substring is assigned but not used below.
length_longest_substring = 0
for i in range(len(word)):
print('nextreference : ', nextreference)
letter = word[i]
print('letter : ', letter)
if word[i] in reference:
foundsub = True
num_of_substring += 1
locals()['substring'+str(num_of_substring)] = word[i]
print('substring : ', locals()['substring'+str(num_of_substring)])
# Grow the slice starting at i for as long as it is still found in reference.
for j in range(len(reference)-i):
if word[i:i+j+1] in reference:
locals()['substring'+str(num_of_substring) ]= word[i:i+j+1]
print('long_sub : ',locals()['substring'+str(num_of_substring)])
print('new : ',len(locals()['substring'+str(num_of_substring)]))
print('next : ',len(nextreference))
print('ref : ', len(reference))
longer = (len(reference)<len(locals()['substring'+str(num_of_substring)]))
longer2 = (len(nextreference)<len(locals()['substring'+str(num_of_substring)]))
if (num_of_substring==1) or longer or longer2:
nextreference = locals()['substring'+str(num_of_substring)]
if not foundsub:
# No character of this word occurs in reference: blank out the reference
# in every word (in place) and start over from first.
for i in range(len(words)):
words[i] = words[i].replace(reference, '§')
#§ should not be used in any of the strings, put a character you don't use here
print(words)
try:
nextreference = main(words, first)
except Exception as e:
# Any exception from the recursive call is turned into a None result.
return None
reference = nextreference
return reference
def prep(words):
    """Run ``main`` with the first word as reference and the rest as input.

    Args:
        words: Non-empty list of strings.

    Returns:
        Whatever ``main`` returns for these words.

    Raises:
        IndexError: If ``words`` is empty.
    """
    first = words[0]
    # Hand main() a new list holding the remaining words instead of removing
    # the first one in place, so the caller's list is left untouched.
    answer = main(words[1:], first)
    return answer


if __name__ == '__main__':
    words = ['azerty', 'azertydqse', 'fghertqdfqf', 'ert', 'sazjjjjjjjjjjjert']
    # just some absurd examples any word in here
    substring = prep(words)
    print('answer : ', substring)
It is basically creating your own library.
I hope this answer helps someone.
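Here is a recursive solution (note that it returns the length of the longest common subsequence, where the matching characters need not be adjacent):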
def lcs(X, Y, m, n):
    """Return the length of the longest common subsequence of X[:m] and Y[:n].

    A subsequence keeps the characters in order but they need not be
    adjacent, so this is a length, not a common-substring text.

    The original double recursion took exponential time; this computes the
    same recurrence bottom-up in O(m * n) time and O(n) memory.

    Args:
        X, Y: The two sequences to compare.
        m, n: How many leading items of ``X`` and ``Y`` to consider.
    """
    if m == 0 or n == 0:
        return 0
    # prev[j] holds the answer for the prefixes X[:i-1] and Y[:j].
    prev = [0] * (n + 1)
    for i in range(1, m + 1):
        curr = [0] * (n + 1)
        for j in range(1, n + 1):
            if X[i - 1] == Y[j - 1]:
                # Matching last items extend the best result without them.
                curr[j] = prev[j - 1] + 1
            else:
                # Otherwise drop the last item of one side or the other.
                curr[j] = max(curr[j - 1], prev[j])
        prev = curr
    return prev[n]
Since someone asked for a multiple-word solution, here's one:
def multi_lcs(words):
    """Return the longest substring shared by every string in ``words``.

    Candidates are taken from the shortest word, longest first and left to
    right, so the first candidate found in all other words wins.

    Args:
        words: List of strings. It is not modified.

    Returns:
        The longest common substring, or False when ``words`` is empty or
        the words have no character in common.
    """
    if not words:
        return False
    # Work on a sorted copy: the original sorted and popped the caller's
    # list in place.
    ordered = sorted(words, key=len)
    search, others = ordered[0], ordered[1:]
    s_len = len(search)
    for ln in range(s_len, 0, -1):
        for start in range(s_len - ln + 1):
            cand = search[start:start + ln]
            if all(cand in word for word in others):
                return cand
    return False
>>> multi_lcs(['xlaqseabcitt', 'peoritabcpeor'])
'abc'
>>> multi_lcs(['xlaqseabcitt', 'peoritabcpeor', 'visontatlasrab'])
'ab'
for small strings, copy this into a file in your project, let's say string_utils.py
def find_longest_common_substring(string1, string2):
    """Find the longest substring that occurs in both strings.

    Returns:
        A tuple ``(substring, index1, index2)`` with the longest common
        substring and the index at which it starts in ``string1`` and in
        ``string2``. When several substrings share the maximum length, the
        first one found wins. If the strings have nothing in common the
        result is ``("", None, None)``.
    """
    s1 = string1
    s2 = string2
    longest_substring = ""
    longest_substring_i1 = None
    longest_substring_i2 = None

    # iterate through every index (i1) of s1
    for i1, c1 in enumerate(s1):
        # for each index (i2) of s2 that matches s1[i1]
        for i2, c2 in enumerate(s2):
            if c1 != c2:
                continue
            # start of a common substring: extend it while both strings
            # still have characters left and those characters agree
            delta = 1
            while (i1 + delta < len(s1) and i2 + delta < len(s2)
                   and s2[i2 + delta] == s1[i1 + delta]):
                delta += 1
            substring = s1[i1:(i1 + delta)]
            # print(f'substring candidate: {substring}')
            # replace longest_substring if newly found substring is longer
            if len(substring) > len(longest_substring):
                longest_substring = substring
                longest_substring_i1 = i1
                longest_substring_i2 = i2
    return (longest_substring, longest_substring_i1, longest_substring_i2)
Then it can be used as follows:
import string_utils
print(f"""(longest substring, index of string1, index of string2):
{ string_utils.find_longest_common_substring("stackoverflow.com", "tackerflow")}""")
For any that are curious the print statement when uncommented prints:
substring candidate: tack
substring candidate: ack
substring candidate: ck
substring candidate: o
substring candidate: erflow
substring candidate: rflow
substring candidate: flow
substring candidate: low
substring candidate: ow
substring candidate: w
substring candidate: c
substring candidate: o
(longest substring, index of string1, index of string2):
('erflow', 7, 4)
Here is a naive solution in terms of time complexity but simple enough to understand:
def longest_common_substring(a, b):
    """Find longest common substring between two strings A and B.

    Every substring of the shorter string is tried, longest first and left
    to right, and the first one contained in the longer string is returned.
    Returns '' when the strings share no character.
    """
    # Take the candidates from the shorter string: fewer of them to try.
    if len(a) > len(b):
        a, b = b, a
    for size in range(len(a), 0, -1):
        for start in range(len(a) - size + 1):
            candidate = a[start:start + size]
            if candidate in b:
                return candidate
    return ''
A super fast library is available for Python: pylcs
It can find the indices of the longest common substring (LCS) between 2 strings, and can do some other related tasks as well.
A function to return the LCS using this library consists of 2 lines:
import pylcs
def find_LCS(s1, s2):
    """Return the longest common substring of ``s1`` and ``s2`` using pylcs.

    ``pylcs.lcs_string_idx`` is used as described in the accompanying
    explanation: one entry per character of ``s1``, holding the index of
    the matching character in ``s2`` or -1 when that character is not part
    of the result.
    """
    res = pylcs.lcs_string_idx(s1, s2)
    # Collect the characters of s2 that the mapping points at.
    return ''.join(s2[i] for i in res if i != -1)
Example:
s1 = 'bbbaaabaa'
s2 = 'abaabaab'
print(find_LCS(s1, s2))
aabaa
Explanation:
In this example res is:
[-1, -1, -1, -1, 2, 3, 4, 5, 6]
It is a mapping of all characters in s1 - to the indices of characters in s2 of the LCS.
-1 indicates that the character of s1 is NOT part of the LCS.
The reasons behind the speed and efficiency of this library are that it's implemented in C++ and uses dynamic programming.

Executing an anagram checker algorithm in less than a second

I have created this algorithm to check whether two strings are anagram of each other.
In this exercise, I consider two strings to be anagram of each other if they have the same characters or if they differ just by one. For example math and amth are anagram but even math and maths are anagram.
I need to execute this algorithm in less than a second, but with some examples included in the test it takes sometimes more than 10 minutes. So clearly this can be done way, way better. The nested for loop is the problem but I just can't come up with a possible solution without that.
#len(word1) <= len(word2)
def check(word1, word2):
    """Return True when the words are anagrams, allowing one extra letter.

    Words of equal length must consist of exactly the same letters.
    Otherwise ``word2`` must contain every letter of ``word1`` plus exactly
    one more. Callers are expected to pass ``len(word1) <= len(word2)``.
    """
    # Imported locally so the function works when copied on its own.
    from collections import Counter

    if len(word1) == len(word2):
        return sorted(word1) == sorted(word2)

    # Counter subtraction keeps positive counts only, so (c1 - c2) holds the
    # letters of word1 that word2 lacks and (c2 - c1) the surplus letters of
    # word2. This replaces the quadratic list.remove() loop.
    c1 = Counter(word1)
    c2 = Counter(word2)
    return not (c1 - c2) and sum((c2 - c1).values()) == 1
def main(fin, fout, k):
    """Write the largest group of near-anagrams found in ``fin`` to ``fout``.

    Args:
        fin: Path of the input file, one word per line.
        fout: Path of the output file.
        k: Number of words written per output line.

    Returns:
        The size of the largest group. Ties between groups of equal size go
        to the alphabetically smallest key word. An empty input produces an
        empty output file and returns 0.
    """
    # The with-block closes the input file, which the original left open.
    with open(fin) as src:
        words = [line.rstrip('\n').strip(' ') for line in src]

    # Map each word to every word that check() accepts as its anagram.
    d = {}
    for w1 in words:
        for w2 in words:
            if len(w1) == len(w2) or len(w1) == len(w2) - 1:
                if check(w1, w2):
                    d.setdefault(w1, []).append(w2)

    if not d:
        # Nothing to report; indexing into an empty dict used to raise here.
        with open(fout, 'w'):
            pass
        return 0

    # Largest group first, alphabetically smallest key on ties.
    _, highV = min(d.items(), key=lambda item: (-len(item[1]), item[0]))
    highV.sort()
    with open(fout, 'w') as f:
        for i, word in enumerate(highV):
            f.write(word + ' ')
            if (i + 1) % k == 0:
                f.write('\n')
    return len(highV)
You should check out the Counter from collections:
from collections import Counter
str = 'Testanagram'
counter = Counter(str)
print(counter)
> Counter({'a': 3, 'T': 1, 'e': 1, 's': 1, 't': 1, 'n': 1, 'g': 1, 'r': 1, 'm': 1})
Using this, you should be much faster - you can also subtract one counter from another to get the diff
This seems to work pretty fast, although for my list of words (235,886 of 'em, the full list from /usr/share/dict/words) it takes around 2 seconds so it might still be too slow for you. But– honestly, how often do you plan to run it on the entire list?
with open('/usr/share/dict/words', 'r') as f:
    wordlist = f.readlines()
wordlist = [word.strip() for word in wordlist]
# Pair every word with its signature: the word's letters in sorted order.
wordlist = [(word, ''.join(sorted(word))) for word in wordlist]
# Group the words by signature; anagrams share the same signature.
worddict = {}
for word in wordlist:
    worddict.setdefault(word[1], []).append(word[0])
# Print the group of every word that has at least one anagram.
for word in wordlist:
    if len(worddict[word[1]]) > 1:
        print(worddict[word[1]])
Result:
['aal', 'ala']
['aam', 'ama']
['aba', 'baa']
['abac', 'caba']
['abactor', 'acrobat']
['abaft', 'bafta']
['abalone', 'balonea']
...
(27,390 lines omitted for brevity)
['ozotype', 'zootype']
['gazy', 'zyga']
['glazy', 'zygal']
[Finished in 2.1s]
It creates a dictionary with the sorted characters of each word as the key and a list containing just that word itself as its initial value. If the key is already present in the dictionary, the word is appended to its list. Then it's only a matter of printing all lists longer than 1 item.
A side effect is that all anagram words appear multiple times. That's logic for you: incomputable, which is in this word list, is an anagram of uncompatible, and therefore uncompatible (per definition also in this list) is an anagram of incomputable. QED.¹
The largest set of anagram words it finds is this one:
['angor', 'argon', 'goran', 'grano', 'groan', 'nagor', 'orang', 'organ', 'rogan']
and an interesting pair of opposites is
['misrepresentation', 'representationism']
The list even contains the word `pythonic':
['hypnotic', 'phytonic', 'pythonic', 'typhonic']
¹ After trying: printing each combination only once appeared to be trivial. It reduces the list to 12,189 'unique' sets, and the check took another 0.1 second.
The main optimisation will be in check().
Using something like collections.Counter, the solution looks simpler, but is slower:
def check_with_counter(word1, word2):
    """Return True when the words' letters differ by at most one letter.

    The letters each word has in excess of the other are counted; fewer
    than two surplus letters in total means the words are anagrams or one
    of them has a single extra letter.
    """
    # Imported locally so the snippet is self-contained.
    from collections import Counter

    c1 = Counter(word1)
    c2 = Counter(word2)
    # Counter subtraction drops non-positive counts, so each difference
    # holds only the surplus letters of one side.
    return sum(((c1 - c2) + (c2 - c1)).values()) < 2
A solution similar to yours, but considerably faster (by about an order of magnitude)
def check_faster(word1, word2):
    """Return True when the words are anagrams, allowing one extra letter.

    Works for words in either order: equal-length words must consist of the
    same letters, and words whose lengths differ by one must differ only by
    the longer word's extra letter. Any other length difference is False.
    """
    ld = len(word1) - len(word2)
    if ld == 0:
        # Compare the sorted letters: comparing the unsorted lists would
        # only accept identical words, not anagrams such as math/amth.
        return sorted(word1) == sorted(word2)
    if ld == 1:
        sword_long, sword_short = list(word1), list(word2)
    elif ld == -1:
        sword_long, sword_short = list(word2), list(word1)
    else:
        return False
    # Strike every letter of the short word from the long one. A letter that
    # is missing leaves an additional letter behind, failing the test below.
    for c in sword_short:
        try:
            sword_long.remove(c)
        except ValueError:
            pass
    return len(sword_long) < 2
And putting it to use in a somewhat faster run():
def run_faster(fin, fout, k):
    """Write the largest group of near-anagrams found in ``fin`` to ``fout``.

    Args:
        fin: Path of the input file, one word per line.
        fout: Path of the output file.
        k: Number of words written per output line.

    Returns:
        The size of the largest group (0 for an empty input). Among groups
        of equal size the first one encountered is used.
    """
    # The with-block closes the input file, which the original left open.
    with open(fin) as src:
        words = [line.rstrip('\n').strip(' ') for line in src]

    # Map each word to every word check_faster() accepts as its anagram.
    d = {}
    for w1 in words:
        for w2 in words:
            if check_faster(w1, w2):
                d.setdefault(w1, []).append(w2)

    # max() returns the first of several equally long groups.
    most_anagrams = max(d.values(), key=len, default=[])
    most_anagrams.sort()
    with open(fout, 'w') as f:
        for i, word in enumerate(most_anagrams):
            f.write(word + ' ')
            if (i + 1) % k == 0:
                f.write('\n')
    return len(most_anagrams)

Find common substring between two strings

I'd like to compare 2 strings and keep the matched, splitting off where the comparison fails.
So if I have 2 strings:
string1 = "apples"
string2 = "appleses"
answer = "apples"
Another example, as the string could have more than one word:
string1 = "apple pie available"
string2 = "apple pies"
answer = "apple pie"
I'm sure there is a simple Python way of doing this but I can't work it out, any help and explanation appreciated.
For completeness, difflib in the standard-library provides loads of sequence-comparison utilities. For instance find_longest_match which finds the longest common substring when used on strings. Example use:
from difflib import SequenceMatcher
string1 = "apple pie available"
string2 = "come have some apple pies"
match = SequenceMatcher(None, string1, string2).find_longest_match()
print(match) # -> Match(a=0, b=15, size=9)
print(string1[match.a:match.a + match.size]) # -> apple pie
print(string2[match.b:match.b + match.size]) # -> apple pie
If you're using a version older than 3.9, you need to call find_longest_match() with the following arguments:
SequenceMatcher(None, string1, string2).find_longest_match(0, len(string1), 0, len(string2))
One might also consider os.path.commonprefix that works on characters and thus can be used for any strings.
import os
common = os.path.commonprefix(['apple pie available', 'apple pies'])
assert common == 'apple pie'
As the function name indicates, this only considers the common prefix of two strings.
def common_start(sa, sb):
    """ returns the longest common substring from the beginning of sa and sb """
    def _iter():
        # Yield characters for as long as both strings agree; a plain
        # return ends the generator at the first mismatch.
        for a, b in zip(sa, sb):
            if a == b:
                yield a
            else:
                return

    return ''.join(_iter())
>>> common_start("apple pie available", "apple pies")
'apple pie'
Or a slightly stranger way:
def stop_iter():
    """An easy way to break out of a generator (Python 3.6 and older only).

    Since PEP 479 (enforced from Python 3.7) a StopIteration raised inside
    a generator is turned into RuntimeError, so common_start() below no
    longer relies on this helper. It is kept for existing callers.
    """
    raise StopIteration


def common_start(sa, sb):
    """Return the longest common prefix of ``sa`` and ``sb``."""
    from itertools import takewhile

    # takewhile() stops at the first pair of characters that differ.
    matching = takewhile(lambda pair: pair[0] == pair[1], zip(sa, sb))
    return ''.join(a for a, _ in matching)
Which might be more readable as
def terminating(cond):
    """Return True if ``cond`` holds, otherwise raise StopIteration.

    Raising StopIteration used to end an enclosing generator, but since
    PEP 479 (enforced from Python 3.7) it surfaces there as RuntimeError,
    so common_start() below no longer relies on this helper. It is kept for
    existing callers.
    """
    if cond:
        return True
    raise StopIteration


def common_start(sa, sb):
    """Return the longest common prefix of ``sa`` and ``sb``."""
    prefix = []
    for a, b in zip(sa, sb):
        if a != b:
            # First mismatch: the common prefix ends here.
            break
        prefix.append(a)
    return ''.join(prefix)
It's called the Longest Common Substring problem. Here I present a simple, easy to understand but inefficient solution. It will take a long time to produce correct output for large strings, as the complexity of this algorithm is O(N^2).
# Answer code, kept as posted (indentation is missing in this copy).
# For every start offset i in string1, string1[i + j] is compared with
# string2[j]; string2 is therefore always read from its own index j.
def longestSubstringFinder(string1, string2):
answer = ""
len1, len2 = len(string1), len(string2)
for i in range(len1):
match = ""
for j in range(len2):
if (i + j < len1 and string1[i + j] == string2[j]):
match += string2[j]
else:
# NOTE: answer is only updated here, in the mismatch branch, so a run that
# is still matching when the inner loop ends is never recorded.
if (len(match) > len(answer)): answer = match
match = ""
return answer
print(longestSubstringFinder("apple pie available", "apple pies"))
print(longestSubstringFinder("apples", "appleses"))
print(longestSubstringFinder("bapples", "cappleses"))
Output
apple pie
apples
apples
Fixing the bugs in the first answer:
def longestSubstringFinder(string1, string2):
    """Return the longest substring that occurs in both strings.

    Every pair of start positions is tried and the match is extended for as
    long as the characters agree. On a tie the first longest match found
    (scanning ``string1`` left to right) is returned; '' means the strings
    have no character in common.
    """
    answer = ""
    len1, len2 = len(string1), len(string2)
    for i in range(len1):
        for j in range(len2):
            lcs_temp = 0
            while (i + lcs_temp < len1 and j + lcs_temp < len2
                   and string1[i + lcs_temp] == string2[j + lcs_temp]):
                lcs_temp += 1
            # The match is string2[j:j + lcs_temp]; slice it only when it
            # beats the best so far.
            if lcs_temp > len(answer):
                answer = string2[j:j + lcs_temp]
    return answer


print(longestSubstringFinder("dd apple pie available", "apple pies"))
print(longestSubstringFinder("cov_basic_as_cov_x_gt_y_rna_genes_w1000000", "cov_rna15pcs_as_cov_x_gt_y_rna_genes_w1000000"))
print(longestSubstringFinder("bapples", "cappleses"))
print(longestSubstringFinder("apples", "apples"))
The same as Evo's, but with arbitrary number of strings to compare:
def common_start(*strings):
    """ Returns the longest common substring
        from the beginning of the `strings`
    """
    def _iter():
        # zip() walks all strings in lockstep and stops with the shortest.
        for z in zip(*strings):
            if z.count(z[0]) == len(z):  # check all elements in `z` are the same
                yield z[0]
            else:
                return

    return ''.join(_iter())
The fastest way I've found is to use suffix_trees package:
from suffix_trees import STree
a = ["xxxabcxxx", "adsaabc"]
st = STree.STree(a)
print(st.lcs()) # "abc"
This script requests you the minimum common substring length and gives all common substrings in two strings. Also, it eliminates shorter substrings that longer substrings include already.
def common_substrings(str1, str2, min_com=None):
    """Print and return the common substrings of two strings.

    Substrings of the shorter string are tried longest first. A candidate
    is skipped when it is already contained in a previously found (longer)
    common substring.

    Args:
        str1, str2: The strings to compare.
        min_com: Minimum length of a common substring. When omitted, the
            value is asked for on standard input, as before.

    Returns:
        The list of common substrings found (possibly empty).
    """
    len1, len2 = len(str1), len(str2)
    if len1 > len2:
        str1, str2 = str2, str1
        len1, len2 = len2, len1
    # short string=str1 and long string=str2
    if min_com is None:
        min_com = int(input('Please enter the minumum common substring length:'))
    cs_array = []
    for i in range(len1, min_com - 1, -1):
        for k in range(len1 - i + 1):
            candidate = str1[k:i + k]
            if candidate not in str2:
                continue
            # Skip candidates already covered by a longer result.
            if any(candidate in found for found in cs_array):
                continue
            cs_array.append(candidate)
    if cs_array:
        print(cs_array)
    else:
        print('There is no any common substring according to the parametres given')
    return cs_array
common_substrings('ciguliuana','ciguana')
common_substrings('apples','appleses')
common_substrings('apple pie available','apple pies')
Try:
import itertools as it
''.join(el[0] for el in it.takewhile(lambda t: t[0] == t[1], zip(string1, string2)))
It does the comparison from the beginning of both strings.
def matchingString(x, y):
    """Return the longest substring common to ``x`` and ``y``.

    When several common substrings share the maximum length, the one found
    last is returned (the comparison below uses ``<=``). Returns '' if the
    strings have no character in common.
    """
    match = ''
    for i in range(len(x)):
        for j in range(len(y)):
            k = 1
            # Grow the window by one character for as long as both slices
            # stay inside their strings and are still equal.
            while i + k <= len(x) and j + k <= len(y) and x[i:i + k] == y[j:j + k]:
                if len(match) <= len(x[i:i + k]):
                    match = x[i:i + k]
                k = k + 1
    return match


# print() is called as a function so the examples also run on Python 3.
print(matchingString('apple', 'ale'))  # le
print(matchingString('apple pie available', 'apple pies'))  # apple pie
A Trie data structure would work the best, better than DP.
Here is the code.
class TrieNode:
    """One trie node: 26 child slots (one per lowercase letter) and an end flag."""

    def __init__(self):
        # child[k] is the node reached through the k-th letter of 'a'..'z', or None.
        self.child = [None]*26
        # True when an inserted word ends exactly at this node.
        self.endWord = False


class Trie:
    """Prefix tree for words made only of the lowercase letters 'a' through 'z'."""

    def __init__(self):
        self.root = self.getNewNode()

    def getNewNode(self):
        """Return a fresh, empty node."""
        return TrieNode()

    @staticmethod
    def _slot(character):
        """Return the child index for `character`, or None if it is not in 'a'..'z'.

        Without this check a character such as '_' yields a negative index,
        which Python would silently map onto another letter's slot.
        """
        index = ord(character) - ord('a')
        return index if 0 <= index < 26 else None

    def insert(self,value):
        """Store `value` in the trie.

        Raises:
            ValueError: if `value` contains a character outside 'a'..'z'.
        """
        # Validate everything first so a rejected word leaves the trie untouched.
        slots = [self._slot(character) for character in value]
        if None in slots:
            raise ValueError(
                "Trie only supports lowercase letters 'a'-'z': {!r}".format(value))
        node = self.root
        for index in slots:
            if not node.child[index]:
                node.child[index] = self.getNewNode()
            node = node.child[index]
        node.endWord = True

    def search(self,value):
        """Return True if `value` was inserted as a whole word, else False."""
        node = self.root
        for character in value:
            index = self._slot(character)
            # A character outside the alphabet can never be stored in the trie.
            if index is None or not node.child[index]:
                return False
            node = node.child[index]
        return node.endWord
def main():
    """Build a small demo trie and report whether a few words are stored in it."""
    # Input keys (use only 'a' through 'z' and lower case)
    stored_words = ["the", "anaswe"]
    verdicts = ["Not present in trie",
                "Present in trie"]

    # Construct the trie
    trie = Trie()
    for word in stored_words:
        trie.insert(word)

    # search() returns a bool, which indexes `verdicts` (False -> 0, True -> 1)
    for query in ("the", "these", "their", "thaw"):
        print("{} ---- {}".format(query, verdicts[trie.search(query)]))


if __name__ == '__main__':
    main()
Let me know in case of doubts.
For the case where we have a list of words and need to find all their common substrings, I tried some of the answers above; the best was https://stackoverflow.com/a/42882629/8520109, but it has some bugs. For example, for 'histhome' and 'homehist' the result should contain both 'hist' and 'home'. Furthermore, its result changes when the order of the arguments is changed. So I changed the code to find every block of common substring; the result is a set of common substrings:
# Read one line from stdin and split it on single spaces into the list of words to compare.
main = input().split(" ") #a string of words separated by space
def longestSubstringFinder(string1, string2):
    '''Return the first longest run of characters shared by both strings.

    Start positions are scanned in string1 and then in string2; only a
    strictly longer run replaces the current answer. Returns "" when the
    strings share no character.
    '''
    best = ""
    size1 = len(string1)
    size2 = len(string2)
    for start1 in range(size1):
        for start2 in range(size2):
            # Walk both strings in lockstep while the characters agree.
            run = 0
            while (start1 + run < size1 and start2 + run < size2
                   and string1[start1 + run] == string2[start2 + run]):
                run += 1
            if run > len(best):
                best = ''.join(string2[start2:start2 + run])
    return best
def listCheck(main):
    '''Compare the first word with every other word, in both argument orders.

    Returns the sorted list of all longest-common-substring results; each
    later word contributes two entries.
    '''
    reference = main[0]
    result = []
    for other in main[1:]:
        # The finder's answer depends on argument order, so keep both.
        result.extend((
            longestSubstringFinder(reference, other),
            longestSubstringFinder(other, reference),
        ))
    result.sort()
    return result
first_answer = listCheck(main)
final_answer = []
for candidate in first_answer:
    # Drop incorrect matches: a candidate survives only if every input word
    # contains it in full (its longest common substring with the word is itself).
    survives = True
    for word in main:
        if longestSubstringFinder(candidate, word) != candidate:
            survives = False
    if survives:
        final_answer.append(candidate)
print(set(final_answer))
# Sample inputs, each followed (after `#>>>`) by the result the author reports for it.
# NOTE(review): these assign the raw, unsplit string to `main`; presumably they show the
# text typed at the prompt rather than code meant to run here. Confirm.
main = 'ABACDAQ BACDAQA ACDAQAW XYZCDAQ' #>>> {'CDAQ'}
main = 'homehist histhome' #>>> {'hist', 'home'}
def LongestSubString(s1,s2):
    """Return the longest substring common to s1 and s2.

    Substrings of the shorter string are tested against the longer one. When
    several share the maximum length, the one starting earliest in the
    shorter string wins. Returns '' when the strings have nothing in common.
    """
    # Enumerate substrings of the shorter string; search them in the longer one.
    if len(s1)<len(s2):
        s1,s2 = s2,s1
    maxsub = ''
    for i in range(len(s2)):
        # Longest first, and only lengths that would beat the current best.
        for j in range(len(s2), i + len(maxsub), -1):
            if s2[i:j] in s1:
                maxsub = s2[i:j]
                break
    return maxsub
Returns the first longest common substring:
def compareTwoStrings(string1, string2):
    """Return the first longest substring of string1 that also occurs in string2.

    Returns "" when the strings share no character.
    """
    output = ""
    for i in range(len(string1)):
        # Only try lengths that would beat the current best. The end bound is
        # len(string1) + 1 so the slice can include the last character.
        for j in range(i + len(output) + 1, len(string1) + 1):
            candidate = string1[i:j]
            if candidate not in string2:
                # Any longer slice from i contains this one, so it cannot match either.
                break
            output = candidate
    return output
**Return the longest common substring**
def longestSubString(str1, str2):
    """Return the first longest substring of str1 that also occurs in str2.

    Returns "" when the strings share no character.
    """
    longestString = ""
    for i in range(len(str1)):
        # Start at the first length that would beat the current best. The end
        # bound is len(str1) + 1 so slices may reach the final character.
        for j in range(i + len(longestString) + 1, len(str1) + 1):
            candidate = str1[i:j]
            if candidate not in str2:
                # Longer slices from i contain this one, so they cannot match.
                break
            longestString = candidate
    return longestString
This is the classroom problem called 'Longest sequence finder'. I have given some simple code that worked for me, also my inputs are lists of a sequence which can also be a string:
# Returns `both`: the elements of the shorter input for which an equal element was
# found while scanning the longer input.
# NOTE(review): indentation was lost in this copy, so the exact nesting of the
# statements below (in particular the trailing `break` and `removes=0`) cannot be
# confirmed from here.
def longest_substring(list1,list2):
both=[]
# The shorter input drives the outer loop; on equal lengths list1 is used as `small`.
if len(list1)>len(list2):
small=list2
big=list1
else:
small=list1
big=list2
# removes counts mismatching elements of `big` seen so far; stop becomes 1 after the first match.
removes=0
stop=0
for i in small:
for j in big:
if i!=j:
removes+=1
if stop==1:
break
elif i==j:
both.append(i)
# Drops removes+1 elements from the front of `big`. `big` is the caller's own
# object (no copy is made), so the longer input is modified in place and must
# support pop(0), which a str does not.
for q in range(removes+1):
big.pop(0)
stop=1
break
removes=0
return both
As if this question doesn't have enough answers, here's another option:
from collections import defaultdict
def LongestCommonSubstring(string1, string2):
    """Return a longest substring shared by string1 and string2.

    Substrings of the shorter string are tested against the longer one. On
    ties the substring starting earliest in the shorter string is returned.
    Returns "" when the strings share no character.
    """
    # sorted() is stable, so on equal lengths string1 is the one scanned.
    str1, str2 = sorted([string1, string2], key=len)
    match = ""
    for i in range(len(str1)):
        # Each start index builds its candidate from scratch, so text left
        # over from a previous start can never be glued onto it. Only lengths
        # that would beat the current best are tried.
        for k in range(i + len(match) + 1, len(str1) + 1):
            cur = str1[i:k]
            if cur not in str2:
                # Longer slices from i contain `cur`, so they cannot match either.
                break
            match = cur
    return match
Some example cases:
LongestCommonSubstring("whose car?", "this is my car")
> ' car'
LongestCommonSubstring("apple pies", "apple? forget apple pie!")
> 'apple pie'
This isn't the most efficient way to do it but it's what I could come up with and it works. If anyone can improve it, please do. What it does is it makes a matrix and puts 1 where the characters match. Then it scans the matrix to find the longest diagonal of 1s, keeping track of where it starts and ends. Then it returns the substring of the input string with the start and end positions as arguments.
Note: This only finds one longest common substring. If there's more than one, you could make an array to store the results in and return that. Also, it's case-sensitive, so (Apple pie, apple pie) will return pple pie.
# Find one longest common substring of str1 and str2 by marking equal characters in a
# 0/1 matrix and following diagonals of 1s.
# NOTE(review): indentation was lost in this copy; the comments follow the statements in
# the order they appear, and the nesting should be confirmed against the original answer.
# NOTE: `numpy` is used below but is not imported inside this block.
def longestSubstringFinder(str1, str2):
answer = ""
# Equal lengths: identical strings are returned as-is; otherwise str1 takes the "longer" role.
if len(str1) == len(str2):
if str1==str2:
return str1
else:
longer=str1
shorter=str2
# Reached only when the lengths differ; an empty input has no common substring.
elif (len(str1) == 0 or len(str2) == 0):
return ""
elif len(str1)>len(str2):
longer=str1
shorter=str2
else:
longer=str2
shorter=str1
# matrix[i][j] is set to 1 where shorter[i] equals longer[j]; all other cells stay 0.
matrix = numpy.zeros((len(shorter), len(longer)))
for i in range(len(shorter)):
for j in range(len(longer)):
if shorter[i]== longer[j]:
matrix[i][j]=1
# longest is the best diagonal length so far; start/end hold its first and last [row, column].
longest=0
start=[-1,-1]
end=[-1,-1]
# Rows are visited from the last row of the matrix up to the first.
for i in range(len(shorter)-1, -1, -1):
for j in range(len(longer)):
count=0
begin = [i,j]
# Follow the diagonal (i+1, j+1) for as long as the cells hold 1, counting its length.
while matrix[i][j]==1:
finish=[i,j]
count=count+1
# Stop at the last column or last row so the next step cannot index past the matrix.
if j==len(longer)-1 or i==len(shorter)-1:
break
else:
j=j+1
i=i+1
# Step the row index back by the length of the diagonal just followed.
i = i-count
# Only a strictly longer diagonal replaces the remembered one.
if count>longest:
longest=count
start=begin
end=finish
break
# start[0] and end[0] are row indices, i.e. positions in `shorter`; the end position is
# included. If nothing matched, both are still -1 and the slice is empty.
answer=shorter[int(start[0]): int(end[0])+1]
return answer
First a helper function adapted from the itertools pairwise recipe to produce substrings.
import itertools
def n_wise(iterable, n = 2):
    '''Return sliding windows of n consecutive items.

    n = 2 -> (s0,s1), (s1,s2), (s2, s3), ...
    n = 3 -> (s0,s1, s2), (s1,s2, s3), (s2, s3, s4), ...'''
    copies = itertools.tee(iterable, n)
    for offset, copy in enumerate(copies):
        # Advance the k-th copy by k items (an empty islice consumes exactly
        # `offset` items; the first copy is left untouched).
        next(itertools.islice(copy, offset, offset), None)
    return zip(*copies)
Then a function that iterates over substrings, longest first, and tests for membership (efficiency not considered).
def foo(s1, s2):
    '''Finds the longest matching substring.

    Substrings of the shorter string are tried longest first, and the first
    one that occurs in the longer string is returned. Returns '' when the
    strings share no character.
    '''
    # the longest matching substring can only be as long as the shortest string
    shortest, longest = sorted([s1, s2], key = len)
    # Iterate over substring lengths from the full length down to a single
    # character, so that matches of length 1 and 2 are found as well.
    for n in range(len(shortest), 0, -1):
        for sub in n_wise(shortest, n):
            sub = ''.join(sub)
            if sub in longest:
                # lengths are tried in decreasing order, so the first hit is a longest one
                return sub
    return ''
# Example run; the author reports the output "domain" for this pair.
s = "fdomainster"
t = "exdomainid"
print(foo(s,t))
>>>
domain
>>>
def LongestSubString(s1,s2):
    """Return the longest substring of s2 that also occurs in s1.

    When several substrings share the maximum length, the one starting
    earliest in s2 is returned. Returns '' when nothing is shared.
    """
    best = ''
    for left in range(len(s2)):
        # Try the longest window first, and only windows longer than the
        # current best. Shrinking a single window from one fixed start, as
        # before, could stop at a short match and miss a longer one later on.
        for right in range(len(s2), left + len(best), -1):
            if s2[left:right] in s1:
                best = s2[left:right]
                break
    return best
# Example run: "appl" is the longest piece of s2 that occurs in s1.
s1 = "pineapple"
s2 = "applc"
print(LongestSubString(s1,s2))

Categories