Python compare partial string in a list with each other

Python compare partial string in a list with each other - python

I am trying to write a code to compare each string in a list to each other and then generate its regex for similarity
list = ["LONDON-UK-L16-N1",
"LONDON-UK-L17-N1",
"LONDON-UK-L16-N2",
"LONDON-UK-L17-N2",
"PARIS-France-L16-N2"]
I am trying to get an output as below
LONDON-UK-L(16|17)-N(1|2)
is that possible? thanks
Update: just to make it clear i am trying to
input: list, or strings
Action: compare list items to each other, and check for similarity (to fix it-first group of a string), and use regex for any other not similar part of item, so instead of having for items, we can have a single output (using regex)
output: regex to match not similar
input:
tez15-3-s1-y2
tez15-3-s2-y2
bro40-55-s1-y2
output:
tez15-3-s(1|2)-y2
,bro40-55-s1-y2

It's not entirely clear from your question what the exact problem is. Since the data you gave as an example is consistent and well ordered, this problem can be solved easily by splitting up the items in the list and categorising them.
# Sample identifiers of the form "<CITY>-<COUNTRY>-L<number>-N<number>".
loc_list = ["LONDON-UK-L16-N1", "LONDON-UK-L17-N1", "LONDON-UK-L16-N2",
            "LONDON-UK-L16-N2", "PARIS-France-L16-N2"]
split_loc_list = [location.split("-") for location in loc_list]

# Map "<CITY>-<COUNTRY>" -> {"L": {numbers}, "N": {numbers}}.
# Sets are used so that repeated identifiers contribute each number once.
locs = {}
for loc in split_loc_list:
    tags = locs.setdefault("-".join(loc[0:2]), {})
    tags.setdefault("L", set()).add(loc[2].strip("L"))
    tags.setdefault("N", set()).add(loc[3].strip("N"))

for loc, vals in locs.items():
    # Sort as integers so that, e.g., 9 is listed before 10.
    L_vals_sorted = sorted(map(int, vals["L"]))
    L_vals_joined = "|".join(map(str, L_vals_sorted))
    N_vals_sorted = sorted(map(int, vals["N"]))
    N_vals_joined = "|".join(map(str, N_vals_sorted))
    print(f"{loc}-L({L_vals_joined})-N({N_vals_joined})")
will output:
LONDON-UK-L(16|17)-N(1|2)
PARIS-France-L(16)-N(2)
Since there were only two tags here ("L" and "N"), I just wrote them into the code. If there are many tags possible, then you can strip by any letter using:
import re
split = re.findall('\d+|\D+', loc[2])
key, val = split[0], split[1]
locs.setdefault("-".join(loc[0:2]), {}).\
setdefault(key, set()).add(val)
Then iterate through all the tags instead of just fetching "L" and "N" in the second loop.

I am posting this new (second) implementation for this problem; I think it is more accurate and hope it is helpful:
import re
data = [
'LONDON-UK-L16-N1',
'LONDON-UK-L17-N1',
'LONDON-UK-L16-N2',
'LONDON-UK-L17-N2',
'LONDON-UK-L18-N2',
'PARIS-France-L16-N2',
]
def merge(data):
    """Collapse dash-separated identifiers that differ only in one number.

    Each identifier is split on '-'. Working from the last column to the
    first, consecutive rows (in sorted order) whose other columns and whose
    non-digit prefix in the current column are identical are merged into one
    row, with the differing numbers written as an alternation, e.g.
    ``L16``/``L17`` -> ``L(16|17)``.

    Args:
        data: iterable of strings such as ``'LONDON-UK-L16-N1'``. It is not
            modified. Rows are expected to have the same number of
            dash-separated fields (a shorter row raises IndexError).

    Returns:
        List of merged identifier strings; an empty list for empty input.
    """
    # sorted() instead of list.sort(): do not reorder the caller's list.
    rows = [item.split('-') for item in sorted(data)]
    if not rows:
        return []

    def split_token(token):
        # First run of non-digits and first run of digits ('' if absent).
        text = re.search(r'\D+', token)
        number = re.search(r'\d+', token)
        return (text.group() if text else '',
                number.group() if number else '')

    def flush(key, numbers, out):
        # Emit one merged row for the group that has just ended.
        head, text, tail = key
        if len(numbers) > 1:
            token = f'{text}({"|".join(sorted(numbers))})'
        else:
            token = f'{text}{numbers[0]}'
        out.append(head + [token] + tail)

    for col in range(len(rows[0]) - 1, -1, -1):
        merged = []
        group_key = None
        numbers = []
        for row in rows:
            text, number = split_token(row[col])
            # Compare structured keys rather than concatenated strings so
            # that different splits of the same characters never collide.
            key = (row[:col], text, row[col + 1:])
            if key != group_key:
                if group_key is not None:
                    flush(group_key, numbers, merged)
                group_key = key
                numbers = []
            if number not in numbers:
                numbers.append(number)
        flush(group_key, numbers, merged)
        rows = merged
    return ['-'.join(row) for row in rows]
print(merge(data)) # ['LONDON-UK-L(16|17)-N(1|2)', 'LONDON-UK-L18-N2', 'PARIS-France-L16-N2']

I've implemented the following solution:
import re
data = [
'LONDON-UK-L16-N1',
'LONDON-UK-L17-N1',
'LONDON-UK-L16-N2',
'LONDON-UK-L16-N2',
'PARIS-France-L16-N2'
]
def deconstruct(data):
    """Build a nested prefix tree from dash-separated identifiers.

    Every token contributes its first run of non-digits as a dictionary key
    and its first run of digits as a member of that node's number set, which
    is stored under the integer key ``0``.

    Args:
        data: iterable of strings such as ``'LONDON-UK-L16-N1'``.

    Returns:
        Nested dict of the form ``{text: {0: {numbers}, child_text: {...}}}``.

    Note:
        A token without any non-digit text does not open a new level; its
        number is added to the current node instead. If that happens for the
        very first token, a KeyError is raised because the root has no
        number set.
    """
    result = {}
    for item in data:
        pointer = result
        for token in item.split('-'):
            text = re.search(r'\D+', token)
            if text:
                # Descend, creating the child node on first sight.
                pointer = pointer.setdefault(text.group(), {0: set()})
            number = re.search(r'\d+', token)
            if number:
                pointer[0].add(number.group())
    return result
def construct(data, level=0):
    """Turn a tree produced by ``deconstruct`` back into merged strings.

    Args:
        data: nested dict where key ``0`` holds a set of number strings and
            every other key is a text prefix mapping to a child node.
        level: recursion depth; callers should leave it at 0.

    Returns:
        At level 0, a list of dash-joined strings in which nodes with several
        numbers are rendered as ``text(n1|n2|...)``. At deeper levels, a list
        of token lists (used internally by the recursion).
    """
    def number_order(value):
        # Order digit strings by numeric value (so '9' precedes '10');
        # anything else falls back to plain string order after the numbers.
        if value.isdecimal():
            return (0, int(value), value)
        return (1, 0, value)

    result = []
    for key, node in data.items():
        if key == 0:
            # Key 0 is this node's own number set, not a child.
            continue
        numbers = node[0]
        if len(numbers) > 1:
            nums = '(' + '|'.join(sorted(numbers, key=number_order)) + ')'
        elif numbers:
            nums = next(iter(numbers))
        else:
            nums = ''
        deeper_result = construct(node, level + 1)
        if deeper_result:
            result.extend([key + nums] + tail for tail in deeper_result)
        else:
            result.append([key + nums])
    if level > 0:
        return result
    return ['-'.join(tokens) for tokens in result]
print(construct(deconstruct(data)))
# ['LONDON-UK-L(16|17)-N(1|2)', 'PARIS-France-L16-N2']

Don't use 'list' as a variable name: it is the name of a built-in type (not a reserved word), and reassigning it hides the built-in for the rest of the scope.
import re
lst = ['LONDON-UK-L16-N1', 'LONDON-UK-L17-N1', 'LONDON-UK-L16-N2', 'LONDON-UK-L16-N2', 'PARIS-France-L16-N2']
def check_it(string):
    """Search ``string`` for a ``<letters/dashes>L<digits>-N<digits>`` tag.

    Returns:
        The ``re.Match`` object, or None when the pattern is not found.
        Group 1 holds all digits after ``L`` and group 2 all digits after
        ``N``.
    """
    # (\d*) rather than (\d)*: a repeated group only keeps its last
    # repetition, which would capture '6' instead of '16'.
    return re.search(r'[a-zA-Z\-]*L(\d*)-N(\d*)', string)
[check_it(x).group(0) for x in lst]
will output:
['LONDON-UK-L16-N1',
'LONDON-UK-L17-N1',
'LONDON-UK-L16-N2',
'LONDON-UK-L16-N2',
'PARIS-France-L16-N2']
From there, look into groups and define a group to cover the pieces that you want to use for similarity.

Related

Regex match items in list + trailing N numbers (Python)

I have a list of expected animals:
expectedAnimals = ['cat-', 'snake-', 'hedgehog-']
Then I have a user input (in string format) that contains some or all of the expected animals from the above list, each followed by N digits. These animals are separated by random delimiting symbols (non-integer):
Examples:
inputString1 = 'cat-235##randomtext-123...snake-1,dog-2:snake-22~!cat-8844'
inputString2 = 'hedgehog-2>cat-1|snake-22#cat-2<$dog-55 snake-93242522. cat-3 .rat-2 snake-22 cat-8844'
My goal (with which I am struggling) is to write the function filterAnimals that should return the following correct results:
approvedAnimals1 = filterAnimals(inputString1)
['cat-235', 'snake-1', 'snake-22', 'cat-8844']
approvedAnimals2 = filterAnimals(inputString2):
['hedgehog-2', 'cat-1', 'snake-22', 'cat-2', 'snake-93242522', 'cat-3', 'snake-22', 'cat-8844']
My current implementation works partially but honestly I would like to re-write it from scratch:
def filterAnimals(inputString):
    """Extract every expected animal tag followed by one or more digits.

    Args:
        inputString: text containing tags such as ``'cat-235'`` separated by
            arbitrary non-digit delimiters.

    Returns:
        List of matches (e.g. ``['cat-235', 'snake-1']``) in the order they
        appear in ``inputString``.
    """
    import re  # local import keeps this snippet self-contained

    expectedAnimals = ['cat-', 'snake-', 'hedgehog-']
    # One left-to-right scan keeps results in input order and guarantees
    # that each start position is paired with its own run of digits.
    names = '|'.join(re.escape(animal) for animal in expectedAnimals)
    return re.findall(rf'(?:{names})\d+', inputString)

You can build an alternation pattern from expectedAnimals and use re.findall to find all matches as a list:
import re
def filterAnimals(inputString, animals=None):
    """Return every ``<animal><digits>`` occurrence in ``inputString``.

    Args:
        inputString: text to scan.
        animals: optional iterable of literal prefixes such as ``'cat-'``.
            When omitted, the module-level ``expectedAnimals`` list is used.

    Returns:
        List of matched substrings in order of appearance.
    """
    if animals is None:
        animals = expectedAnimals
    # Escape each name so characters like '.' or '+' are matched literally.
    names = '|'.join(re.escape(animal) for animal in animals)
    if not names:
        # An empty alternation would match bare digit runs.
        return []
    return re.findall(rf"(?:{names})\d+", inputString)
Demo: https://replit.com/#blhsing/OffensiveEveryWebportal

import re
# matches expected animals followed by N numbers
# Raw string so that \d reaches the regex engine unchanged (a plain "\d" is
# an invalid string escape and triggers a warning on recent Python versions).
pattern = re.compile(r"(cat|snake|hedgehog)-\d+")
inputString1 = 'cat-235##randomtext-123...snake-1,dog-2:snake-22~!cat-8844'
inputString2 = 'hedgehog-2>cat-1|snake-22#cat-2<$dog-55 snake-93242522. cat-3 .rat-2 snake-22 cat-8844'
# finditer + group() returns the whole match, not just the captured animal.
animals_1 = [i.group() for i in pattern.finditer(inputString1)]
# will return ['cat-235', 'snake-1', 'snake-22', 'cat-8844']
animals_2 = [i.group() for i in pattern.finditer(inputString2)]
# will return ['hedgehog-2', 'cat-1', 'snake-22', 'cat-2', 'snake-93242522', 'cat-3', 'snake-22', 'cat-8844']

Replace string one by one

I have a string and I need to replace "e" with "x" one at a time. For e.g.
x = "three"
Then the expected output is:
("thrxe", "threx")
and if I have 3 characters to replace, for e.g.
y = "threee"
Then the expected output will be:
("thrxee", "threxe", "threex")
I have tried this:
x.replace("e", "x", 1) # -> 'thrxe'
But not sure how to return the second string "threx".

Try this
x = "threee"
# build a generator expression that yields the position of "e"s
# change "e"s with "x" according to location of "e"s yielded from the genexp
[f"{x[:i]}x{x[i+1:]}" for i in (i for i, e in enumerate(x) if e=='e')]
['thrxee', 'threxe', 'threex']

You could use a generator to replace e with x sequentially through the string. For example:
def replace(string, old, new):
    """Yield copies of ``string`` with one occurrence of ``old`` replaced.

    Occurrences are visited left to right without overlapping; each yielded
    string has exactly one of them replaced by ``new``.

    Args:
        string: text to search.
        old: non-empty substring to replace.
        new: replacement text.

    Raises:
        ValueError: if ``old`` is empty (raised when iteration starts).
    """
    if not old:
        raise ValueError("old must be a non-empty string")
    width = len(old)
    # Start at 0 so that a match at the very beginning is not skipped.
    start = string.find(old)
    while start != -1:
        yield string[:start] + new + string[start + width:]
        start = string.find(old, start + width)
z = replace('threee', 'e', 'x')
for s in z:
print(s)
Output:
thrxee
threxe
threex
Note I've generalised the code to allow for arbitrary length match and replacement strings, if you don't need that just replace l (len(old)) with 1.

def replace(string,old,new):
    """Yield ``string`` once per occurrence of ``old``, with only that
    occurrence replaced by ``new``.

    Yields nothing when ``old`` is empty or does not occur. Works for
    substrings of any length, not just single characters.
    """
    if not old:
        return
    position = string.find(old)
    while position != -1:
        # Slicing (instead of assigning into a list of characters) keeps
        # multi-character ``old`` values correct.
        yield string[:position] + new + string[position + len(old):]
        position = string.find(old, position + len(old))
z = replace('threee', 'e', 'x')
for a in z:
print(a)
OUTPUT
thrxee
threxe
threex

Creating string based on first letters of each element of the list

Example:
list = [abcc, typpaw, gfssdwww]
expected result = atgbyfcpscpsadwwww
Any ideas?
This is what i made so far:
def lazy_scribe(sources: list) -> str:
    """Interleave the characters of several strings, round-robin.

    Takes the first character of every source, then the second of every
    source, and so on; sources that have run out are skipped.

    Args:
        sources: list of strings.

    Returns:
        The interleaved string ('' for an empty list).
    """
    from itertools import chain, zip_longest

    # zip_longest pads exhausted sources with '' which vanishes on join.
    columns = zip_longest(*sources, fillvalue='')
    return ''.join(chain.from_iterable(columns))
sources = ["python", "java", "golang"]
print(lazy_scribe(sources))
print(len(sources))
result: "pjgyaoyvlhaaononngn". I dont know why there is "y" instead of t (7 char in result string)

If I understand the problem correctly, this should work.
# Named `words` rather than `list` so the built-in list type stays usable.
words = ["abcc", "typpaw", "gfssdwww"]
max_len = len(max(words, key=len))
res = ""
char_iterator = 0
# Walk column by column: take character number `char_iterator` of every
# word that is long enough to have one.
while char_iterator < max_len:
    for word in words:
        if char_iterator < len(word):
            res += word[char_iterator]
    char_iterator += 1
print(res)

Another possible solution is as follows:
l = ['abcc', 'typpaw', 'gfssdwww']
max_len = len(max(l, key=len))
padded_l = list(zip(*[e + " " * (max_len - len(e)) for e in l]))
''.join([''.join(e) for e in padded_l]).replace(' ', '')
find the longest string in the list
then pad all the strings in the list with blank space
use zip on the result list
join the elements and replace the blank space to get the desired result

Find common substring between two strings

I'd like to compare 2 strings and keep the matched, splitting off where the comparison fails.
So if I have 2 strings:
string1 = "apples"
string2 = "appleses"
answer = "apples"
Another example, as the string could have more than one word:
string1 = "apple pie available"
string2 = "apple pies"
answer = "apple pie"
I'm sure there is a simple Python way of doing this but I can't work it out, any help and explanation appreciated.

For completeness, difflib in the standard-library provides loads of sequence-comparison utilities. For instance find_longest_match which finds the longest common substring when used on strings. Example use:
from difflib import SequenceMatcher
string1 = "apple pie available"
string2 = "come have some apple pies"
match = SequenceMatcher(None, string1, string2).find_longest_match()
print(match) # -> Match(a=0, b=15, size=9)
print(string1[match.a:match.a + match.size]) # -> apple pie
print(string2[match.b:match.b + match.size]) # -> apple pie
If you're using a version older than 3.9, you need to call find_longest_match() with the following arguments:
SequenceMatcher(None, string1, string2).find_longest_match(0, len(string1), 0, len(string2))

One might also consider os.path.commonprefix that works on characters and thus can be used for any strings.
import os
common = os.path.commonprefix(['apple pie available', 'apple pies'])
assert common == 'apple pie'
As the function name indicates, this only considers the common prefix of two strings.

def common_start(sa, sb):
    """Return the longest common prefix of ``sa`` and ``sb``."""
    prefix = []
    for a, b in zip(sa, sb):
        if a != b:
            # First mismatch ends the shared prefix.
            break
        prefix.append(a)
    return ''.join(prefix)
>>> common_start("apple pie available", "apple pies")
'apple pie'
Or a slightly stranger way:
def stop_iter():
    """Raise StopIteration.

    NOTE: since Python 3.7 (PEP 479) a StopIteration raised inside a
    generator is turned into RuntimeError, so this can no longer be used to
    end a generator expression early.
    """
    raise StopIteration


def common_start(sa, sb):
    """Return the longest common prefix of ``sa`` and ``sb``."""
    from itertools import takewhile

    # takewhile stops at the first mismatching pair without relying on
    # StopIteration leaking out of a generator.
    matching = takewhile(lambda pair: pair[0] == pair[1], zip(sa, sb))
    return ''.join(a for a, _ in matching)
Which might be more readable as
def terminating(cond):
    """Return True when ``cond`` is truthy, otherwise raise StopIteration.

    NOTE: since Python 3.7 (PEP 479) a StopIteration raised inside a
    generator becomes RuntimeError, so this must not be used as a filter in
    a generator expression.
    """
    if cond:
        return True
    raise StopIteration


def common_start(sa, sb):
    """Return the longest common prefix of ``sa`` and ``sb``."""
    shared = []
    for a, b in zip(sa, sb):
        if a != b:
            break
        shared.append(a)
    return ''.join(shared)

It's called the Longest Common Substring problem. Here I present a simple, easy-to-understand but inefficient solution. It will take a long time to produce output for large strings, as the complexity of this algorithm is O(N^2).
def longestSubstringFinder(string1, string2):
    """Return the longest substring common to ``string1`` and ``string2``.

    Uses the classic dynamic-programming table (one row kept at a time), so
    every alignment of the two strings is considered and a match that runs
    to the end of either string is still recorded.

    Returns:
        The first longest common substring as it appears in ``string1``,
        or '' when the strings share no character.
    """
    best_len = 0
    best_end = 0  # end index (exclusive) of the best match in string1
    prev_row = [0] * (len(string2) + 1)
    for i, ch1 in enumerate(string1, 1):
        row = [0] * (len(string2) + 1)
        for j, ch2 in enumerate(string2, 1):
            if ch1 == ch2:
                # Extend the match that ended at the previous characters.
                row[j] = prev_row[j - 1] + 1
                if row[j] > best_len:
                    best_len = row[j]
                    best_end = i
        prev_row = row
    return string1[best_end - best_len:best_end]
print(longestSubstringFinder("apple pie available", "apple pies"))
print(longestSubstringFinder("apples", "appleses"))
print(longestSubstringFinder("bapples", "cappleses"))
Output
apple pie
apples
apples

Fix bugs with the first's answer:
def longestSubstringFinder(string1, string2):
    """Return the longest common substring of ``string1`` and ``string2``.

    Dynamic programming over character pairs, O(len1 * len2) time and
    O(len2) extra space.

    Returns:
        The first longest common substring in ``string1`` order, or ''.
    """
    best_len = 0
    best_end = 0  # exclusive end of the best match within string1
    prev_row = [0] * (len(string2) + 1)
    for i, ch1 in enumerate(string1, 1):
        row = [0] * (len(string2) + 1)
        for j, ch2 in enumerate(string2, 1):
            if ch1 == ch2:
                row[j] = prev_row[j - 1] + 1
                # Strict '>' keeps the earliest match among equal lengths.
                if row[j] > best_len:
                    best_len = row[j]
                    best_end = i
        prev_row = row
    return string1[best_end - best_len:best_end]


print(longestSubstringFinder("dd apple pie available", "apple pies"))
print(longestSubstringFinder("cov_basic_as_cov_x_gt_y_rna_genes_w1000000", "cov_rna15pcs_as_cov_x_gt_y_rna_genes_w1000000"))
print(longestSubstringFinder("bapples", "cappleses"))
print(longestSubstringFinder("apples", "apples"))

The same as Evo's, but with arbitrary number of strings to compare:
def common_start(*strings):
    """Return the longest common prefix of all the given strings.

    Returns '' when called without arguments or when the strings do not
    share a first character.
    """
    prefix = []
    for column in zip(*strings):
        first = column[0]
        # Stop at the first position where any string disagrees.
        if any(char != first for char in column):
            break
        prefix.append(first)
    return ''.join(prefix)

The fastest way I've found is to use suffix_trees package:
from suffix_trees import STree
a = ["xxxabcxxx", "adsaabc"]
st = STree.STree(a)
print(st.lcs()) # "abc"

This script requests you the minimum common substring length and gives all common substrings in two strings. Also, it eliminates shorter substrings that longer substrings include already.
def common_substrings(str1,str2,min_com=None):
    """Print and return the maximal common substrings of two strings.

    Candidates are taken from the shorter string, longest first; a candidate
    is dropped when it is contained in a substring that was already kept.

    Args:
        str1, str2: strings to compare.
        min_com: minimum substring length to report. When None, the value is
            read interactively from standard input. Values below 1 are
            treated as 1.

    Returns:
        List of the common substrings found (also printed); empty when
        there is none of the requested length.
    """
    len1,len2=len(str1),len(str2)
    if len1 > len2:
        str1,str2=str2,str1
        len1,len2=len2,len1
    #short string=str1 and long string=str2
    if min_com is None:
        min_com = int(input('Please enter the minumum common substring length:'))
    min_com = max(min_com, 1)  # length 0 would only ever yield ''
    cs_array=[]
    for size in range(len1,min_com-1,-1):
        for k in range(len1-size+1):
            candidate = str1[k:size+k]
            if candidate not in str2:
                continue
            # Skip anything already covered by a longer reported match.
            if any(candidate in found for found in cs_array):
                continue
            cs_array.append(candidate)
    if len(cs_array):
        print(cs_array)
    else:
        print('There is no any common substring according to the parametres given')
    return cs_array
common_substrings('ciguliuana','ciguana')
common_substrings('apples','appleses')
common_substrings('apple pie available','apple pies')

Try:
import itertools as it
''.join(el[0] for el in it.takewhile(lambda t: t[0] == t[1], zip(string1, string2)))
It does the comparison from the beginning of both strings.

def matchingString(x,y):
    """Return the longest common substring of ``x`` and ``y``.

    When several common substrings share the maximum length, the one that
    occurs last in ``x`` is returned. Returns '' if nothing is shared.
    """
    best_len = 0
    best_end = 0  # exclusive end of the best match within x
    prev_row = [0] * (len(y) + 1)
    for i, cx in enumerate(x, 1):
        row = [0] * (len(y) + 1)
        for j, cy in enumerate(y, 1):
            if cx == cy:
                row[j] = prev_row[j - 1] + 1
                # '>=' so that a later match of equal length wins.
                if row[j] >= best_len:
                    best_len = row[j]
                    best_end = i
        prev_row = row
    return x[best_end - best_len:best_end]


print(matchingString('apple','ale')) #le
print(matchingString('apple pie available','apple pies')) #apple pie

A Trie data structure would work the best, better than DP.
Here is the code.
class TrieNode:
    """One trie node: 26 child slots (letters 'a'..'z') and an end marker."""

    def __init__(self):
        self.child = [None]*26
        self.endWord = False


class Trie:
    """Prefix tree for words made of the lowercase letters 'a' to 'z'."""

    def __init__(self):
        self.root = self.getNewNode()

    def getNewNode(self):
        """Return a fresh, empty node."""
        return TrieNode()

    @staticmethod
    def _slot(character):
        """Return the child index for ``character`` or None if unsupported."""
        index = ord(character) - ord('a')
        # Reject out-of-range values explicitly: a negative index would
        # otherwise silently alias another letter's slot.
        return index if 0 <= index < 26 else None

    def insert(self,value):
        """Add ``value`` to the trie.

        Raises:
            ValueError: if ``value`` contains a character outside 'a'..'z'.
                Nothing is inserted in that case.
        """
        slots = [self._slot(character) for character in value]
        if None in slots:
            raise ValueError("only lowercase letters 'a'-'z' are supported")
        node = self.root
        for index in slots:
            if not node.child[index]:
                node.child[index] = self.getNewNode()
            node = node.child[index]
        node.endWord = True

    def search(self,value):
        """Return True if ``value`` was inserted as a whole word."""
        node = self.root
        for character in value:
            index = self._slot(character)
            if index is None or not node.child[index]:
                return False
            node = node.child[index]
        return node.endWord
def main():
    """Demo: build a small trie and print whether some words are present."""
    # Input keys (use only 'a' through 'z' and lower case)
    keys = ["the","anaswe"]
    # Indexed with the bool returned by search(): False -> 0, True -> 1.
    output = ["Not present in trie",
              "Present in trie"]
    # Trie object
    t = Trie()
    # Construct trie
    for key in keys:
        t.insert(key)
    # Search for different keys
    print("{} ---- {}".format("the",output[t.search("the")]))
    print("{} ---- {}".format("these",output[t.search("these")]))
    print("{} ---- {}".format("their",output[t.search("their")]))
    print("{} ---- {}".format("thaw",output[t.search("thaw")]))


if __name__ == '__main__':
    main()
Let me know in case of doubts.

In case we have a list of words that we need to find all common substrings I check some of the codes above and the best was https://stackoverflow.com/a/42882629/8520109 but it has some bugs for example 'histhome' and 'homehist'. In this case, we should have 'hist' and 'home' as a result. Furthermore, it differs if the order of arguments is changed. So I change the code to find every block of substring and it results a set of common substrings:
main = input().split(" ") #a string of words separated by space
def longestSubstringFinder(string1, string2):
    '''Find the longest substring shared by ``string1`` and ``string2``.

    Returns the first longest match in ``string1`` order ('' if none), so
    swapping the arguments may return a different match of equal length.
    '''
    best_len = 0
    best_end = 0  # exclusive end of the best match within string1
    prev_row = [0] * (len(string2) + 1)
    for i, ch1 in enumerate(string1, 1):
        row = [0] * (len(string2) + 1)
        for j, ch2 in enumerate(string2, 1):
            if ch1 == ch2:
                # Length of the common run ending at this pair of characters.
                row[j] = prev_row[j - 1] + 1
                if row[j] > best_len:
                    best_len = row[j]
                    best_end = i
        prev_row = row
    return string1[best_end - best_len:best_end]
def listCheck(main):
    '''Collect candidate common substrings for a list of words.

    The first word is compared with every other word in both argument
    orders, because the finder returns the first longest match in the order
    of its first argument.

    Returns a sorted list of candidates (possibly with repeats); an empty
    list when ``main`` has fewer than two words.
    '''
    if not main:
        return []
    string1 = main[0]
    result = []
    for string2 in main[1:]:
        result.append(longestSubstringFinder(string1, string2))
        result.append(longestSubstringFinder(string2, string1))
    result.sort()
    return result
first_answer = listCheck(main)
final_answer = []
for item1 in first_answer: #to remove some incorrect match
string1 = item1
double_check = True
for item2 in main:
string2 = item2
if longestSubstringFinder(string1, string2) != string1:
double_check = False
if double_check:
final_answer.append(string1)
print(set(final_answer))
main = 'ABACDAQ BACDAQA ACDAQAW XYZCDAQ' #>>> {'CDAQ'}
main = 'homehist histhome' #>>> {'hist', 'home'}

def LongestSubString(s1,s2):
    """Return the longest substring common to ``s1`` and ``s2``.

    Substrings of the shorter string are tested against the longer one.
    Returns the first longest match in the shorter string, or '' if the
    strings have nothing in common.
    """
    if len(s1)<len(s2) :
        s1,s2 = s2,s1
    maxsub =''
    for i in range(len(s2)):
        # Only slices longer than the current best are worth testing, and
        # the first hit from this start (longest first) is the best for it.
        for j in range(len(s2),i+len(maxsub),-1):
            if s2[i:j] in s1:
                maxsub = s2[i:j]
                break
    return maxsub

Returns the first longest common substring:
def compareTwoStrings(string1, string2):
    """Return the first longest common substring of the two strings.

    "First" refers to position in ``string1``. Returns '' when the strings
    share no character.
    """
    length = 0
    end = 0  # exclusive end of the best match within string1
    prev_row = [0] * (len(string2) + 1)
    for i, ch1 in enumerate(string1, 1):
        row = [0] * (len(string2) + 1)
        for j, ch2 in enumerate(string2, 1):
            if ch1 == ch2:
                row[j] = prev_row[j - 1] + 1
                # Matches may extend to the final character of string1.
                if row[j] > length:
                    length = row[j]
                    end = i
        prev_row = row
    output = string1[end - length:end]
    return output

**Return the longest common substring**
def longestSubString(str1, str2):
    """Return the longest common substring of ``str1`` and ``str2``.

    The first longest match in ``str1`` order is returned; '' if the
    strings have no character in common.
    """
    maxLength = 0
    end = 0  # exclusive end of the best match within str1
    prev_row = [0] * (len(str2) + 1)
    for i, ch1 in enumerate(str1, 1):
        row = [0] * (len(str2) + 1)
        for j, ch2 in enumerate(str2, 1):
            if ch1 == ch2:
                row[j] = prev_row[j - 1] + 1
                if row[j] > maxLength:
                    maxLength = row[j]
                    end = i
        prev_row = row
    # Slice end is exclusive, so a match reaching the last character of
    # str1 is included in full.
    longestString = str1[end - maxLength:end]
    return longestString

This is the classroom problem called 'Longest sequence finder'. I have given some simple code that worked for me, also my inputs are lists of a sequence which can also be a string:
def longest_substring(list1,list2):
both=[]
if len(list1)>len(list2):
small=list2
big=list1
else:
small=list1
big=list2
removes=0
stop=0
for i in small:
for j in big:
if i!=j:
removes+=1
if stop==1:
break
elif i==j:
both.append(i)
for q in range(removes+1):
big.pop(0)
stop=1
break
removes=0
return both

As if this question doesn't have enough answers, here's another option:
from collections import defaultdict
def LongestCommonSubstring(string1, string2):
    """Return the longest substring that occurs in both strings.

    The shorter string is scanned (the first argument when lengths are
    equal) and the first longest match in it is returned; '' if the strings
    share nothing.
    """
    str1, str2 = sorted([string1, string2], key=len)
    best_len = 0
    best_end = 0  # exclusive end of the best match within str1
    prev_row = [0] * (len(str2) + 1)
    for i, ch1 in enumerate(str1, 1):
        row = [0] * (len(str2) + 1)
        for j, ch2 in enumerate(str2, 1):
            if ch1 == ch2:
                # Runs are tracked per character pair, so a match is always
                # contiguous in both strings.
                row[j] = prev_row[j - 1] + 1
                if row[j] > best_len:
                    best_len = row[j]
                    best_end = i
        prev_row = row
    return str1[best_end - best_len:best_end]
Some example cases:
LongestCommonSubstring("whose car?", "this is my car")
> ' car'
LongestCommonSubstring("apple pies", "apple? forget apple pie!")
> 'apple pie'

This isn't the most efficient way to do it but it's what I could come up with and it works. If anyone can improve it, please do. What it does is it makes a matrix and puts 1 where the characters match. Then it scans the matrix to find the longest diagonal of 1s, keeping track of where it starts and ends. Then it returns the substring of the input string with the start and end positions as arguments.
Note: This only finds one longest common substring. If there's more than one, you could make an array to store the results in and return that. Also, it's case sensitive, so (Apple pie, apple pie) will return pple pie.
def longestSubstringFinder(str1, str2):
    """Return one longest common substring of ``str1`` and ``str2``.

    Conceptually this fills a match matrix (shorter string down, longer
    string across) and follows its diagonals; only the previous row is kept,
    so no external array library is needed. Comparison is case sensitive.

    Returns:
        The first longest match as it appears in the shorter string (the
        second argument when both have the same length), or '' when either
        string is empty or nothing matches.
    """
    if len(str1) >= len(str2):
        longer, shorter = str1, str2
    else:
        longer, shorter = str2, str1
    longest = 0
    end = 0  # exclusive end of the best diagonal run within `shorter`
    prev_row = [0] * (len(longer) + 1)
    for i, ch_short in enumerate(shorter, 1):
        row = [0] * (len(longer) + 1)
        for j, ch_long in enumerate(longer, 1):
            if ch_short == ch_long:
                # Length of the diagonal run of matches ending here.
                row[j] = prev_row[j - 1] + 1
                if row[j] > longest:
                    longest = row[j]
                    end = i
        prev_row = row
    answer = shorter[end - longest:end]
    return answer

First a helper function adapted from the itertools pairwise recipe to produce substrings.
import itertools
def n_wise(iterable, n = 2):
    '''Return an iterator over sliding windows of ``n`` consecutive items.

    n = 2 -> (s0,s1), (s1,s2), (s2, s3), ...
    n = 3 -> (s0,s1, s2), (s1,s2, s3), (s2, s3, s4), ...

    Yields nothing when the iterable has fewer than ``n`` items.
    '''
    iterators = itertools.tee(iterable, n)
    # Advance the k-th copy by k items so the copies are staggered.
    for offset, iterator in enumerate(iterators):
        for _ in range(offset):
            next(iterator, None)
    return zip(*iterators)
Then a function the iterates over substrings, longest first, and tests for membership. (efficiency not considered)
def foo(s1, s2):
    '''Find the longest matching substring of two strings.

    Returns the first longest common substring in the order of the shorter
    string, or '' when the strings have no character in common.
    '''
    # the longest matching substring can only be as long as the shortest string
    shortest, longest = sorted([s1, s2], key = len)
    # iterate over window sizes, longest first, down to a single character
    for n in range(len(shortest), 0, -1):
        for start in range(len(shortest) - n + 1):
            sub = shortest[start:start + n]
            if sub in longest:
                # the first one found is the longest
                return sub
    return ''
s = "fdomainster"
t = "exdomainid"
print(foo(s,t))
>>>
domain
>>>

def LongestSubString(s1,s2):
    """Return the longest substring of ``s2`` that also occurs in ``s1``.

    Every start position in ``s2`` is considered, so a long match later in
    the string is not hidden by a short one earlier. Returns the first
    longest match in ``s2`` order, or '' when nothing is shared.
    """
    best_len = 0
    best_end = 0  # exclusive end of the best match within s2
    prev_row = [0] * (len(s1) + 1)
    for i, ch2 in enumerate(s2, 1):
        row = [0] * (len(s1) + 1)
        for j, ch1 in enumerate(s1, 1):
            if ch2 == ch1:
                row[j] = prev_row[j - 1] + 1
                if row[j] > best_len:
                    best_len = row[j]
                    best_end = i
        prev_row = row
    return s2[best_end - best_len:best_end]
s1 = "pineapple"
s2 = "applc"
print(LongestSubString(s1,s2))

need to list an image seq with missing image numbers in python

I can list the image files in a directory and they look like this:
['IMG_3842.CR2', 'IMG_3843.CR2', 'IMG_3844.CR2', 'IMG_3846.CR2', 'IMG_3847.CR2',
'IMG_3848.CR2', 'IMG_3849.CR2', 'IMG_3850.CR2', 'IMG_3851.CR2', 'IMG_3852.CR2',
'IMG_3855.CR2', 'IMG_3856.CR2', 'IMG_3857.CR2', 'IMG_3858.CR2', 'IMG_3859.CR2']
The file numbers range from 3842 to 3859. However, there are gaps in the numbers.
What I need to do is get a list like this as an output:
3842-3844, 3846-3852, 3855-3859

Assuming the list is already ordered, and all items have the same format
l = ['IMG_3842.CR2', 'IMG_3843.CR2', 'IMG_3844.CR2', 'IMG_3846.CR2', 'IMG_3847.CR2', 'IMG_3848.CR2', 'IMG_3849.CR2', 'IMG_3850.CR2', 'IMG_3851.CR2', 'IMG_3852.CR2', 'IMG_3855.CR2', 'IMG_3856.CR2', 'IMG_3857.CR2', 'IMG_3858.CR2', 'IMG_3859.CR2']
# Characters 4..7 of 'IMG_3842.CR2' are the four-digit frame number.
numl = [int(x[4:8]) for x in l]
first = numl[0]
result = []
# Compare each number with its predecessor; a gap closes the current run.
for previous, current in zip(numl, numl[1:]):
    if current != previous + 1:
        result.append(str(first) + '-' + str(previous))
        first = current
# The run still open at the end of the list.
result.append(str(first) + '-' + str(numl[-1]))
print(result)
Output:
['3842-3844', '3846-3852', '3855-3859']

Referring to Python splitting list based on missing numbers in a sequence
import glob
from itertools import groupby
def ranges(seq, key=int):
    """Split ``seq`` into runs whose ``key`` values increase by exactly 1.

    Args:
        seq: ordered sequence of items.
        key: function mapping an item to its integer position.

    Returns:
        List of lists; each inner list is one consecutive run of items.
    """
    # Within a consecutive run, (index - value) is constant, so it serves
    # as the grouping key. groupby passes one (index, item) tuple per call.
    return [[item for _, item in group]
            for _, group in groupby(enumerate(seq),
                                    lambda pair: pair[0] - key(pair[1]))]
files = list(glob.glob('*.cr2'))
files.sort()
Now you can call
ranges(files, key=lambda s: int(s[4:8]))
and get
[
['IMG_3842.CR2',
'IMG_3843.CR2',
'IMG_3844.CR2'],
['IMG_3846.CR2',
'IMG_3847.CR2',
'IMG_3848.CR2',
'IMG_3849.CR2',
'IMG_3850.CR2',
'IMG_3851.CR2',
'IMG_3852.CR2'],
['IMG_3855.CR2',
'IMG_3856.CR2',
'IMG_3857.CR2',
'IMG_3858.CR2',
'IMG_3859.CR2']
]
Edit:
filenums = [f[4:8] for f in files]
range_strings = ["{}-{}".format(rng[0], rng[-1]) for rng in ranges(filenums)]
gives
['3842-3844', '3846-3852', '3855-3859']

Something like this:
[Iteration is straighforward, I just wanted to try it the reduce way ;)]
from functools import reduce  # reduce is not a builtin in Python 3

S = ['IMG_3842.CR2', 'IMG_3843.CR2', 'IMG_3844.CR2', 'IMG_3846.CR2', 'IMG_3847.CR2', 'IMG_3848.CR2', 'IMG_3849.CR2', 'IMG_3850.CR2', 'IMG_3851.CR2', 'IMG_3852.CR2', 'IMG_3855.CR2', 'IMG_3856.CR2', 'IMG_3857.CR2', 'IMG_3858.CR2', 'IMG_3859.CR2', 'IMG_3863.CR2']
# The frame number is the text between '_' and '.'.
l = sorted([int(x[x.index("_")+1:x.index(".")]) for x in S])


def func(l,val):
    """Reducer: fold ``val`` into the list of (start, end) ranges ``l``.

    Extends the last range when ``val`` directly follows it, otherwise
    starts a new single-value range. Returns the same list.
    """
    if l and val == l[-1][1]+1:
        l[-1] = (l[-1][0], l[-1][1]+1)
    else:
        l.append((val,val))
    return l


# Start from an empty list: a (-1, -1) sentinel would absorb a leading 0.
res = reduce(func, l, [])
print([str(x[0]) if x[0] == x[1] else "{}-{}".format(x[0],x[1]) for x in res])
OUTPUTS:
['3842-3844', '3846-3852', '3855-3859', '3863']

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

Python compare partial string in a list with each other - python

Related

Regex match items in list + trailing N numbers (Python)

Replace string one by one

Creating string based on first letters of each element of the list

Find common substring between two strings

need to list an image seq with missing image numbers in python

Categories

Resources