import re
string = "is2 Thi1s T4est 3a"
def order(sentence):
    res = ''
    count = 1
    list = sentence.split()
    for i in list:
        for i in list:
            a = re.findall('\d+', i)
            if a == [str(count)]:
                res += " ".join(i)
                count += 1
    print(res)
order(string)
Above is the code I have a problem with. The output I should get is:
"Thi1s is2 3a T4est"
Instead I'm getting the correct order but with spaces in the wrong places:
"T h i 1 si s 23 aT 4 e s t"
Any idea how to make it work with this code concept?
You are joining the characters of each word:
>>> " ".join('Thi1s')
'T h i 1 s'
You want to collect your words into a list and join that instead:
def order(sentence):
    number_words = []
    count = 1
    words = sentence.split()
    for word in words:
        for word in words:
            matches = re.findall('\d+', word)
            if matches == [str(count)]:
                number_words.append(word)
                count += 1
    result = ' '.join(number_words)
    print(result)
I used more verbose, clearer variable names. I also removed the list variable; don't use list as a variable name if you can avoid it, as it masks the built-in list name.
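For example, once the name list has been rebound, the built-in type is no longer reachable under that name (a quick illustration, not from the original code):

>>> list = "is2 Thi1s".split()
>>> list('abc')
Traceback (most recent call last):
  ...
TypeError: 'list' object is not callable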
What you implemented comes down to an O(N^2) (quadratic time) sort. You could instead use the built-in sorted() function to bring this down to O(N log N); you'd extract the digits and sort on their integer value:
def order(sentence):
    digit = re.compile(r'\d+')
    return ' '.join(
        sorted(sentence.split(),
               key=lambda w: int(digit.search(w).group())))
This differs a little from your version in that it only looks at the first run of (consecutive) digits in each word, it doesn't care about the numbers being sequential, and it will break for words without digits. It also uses return to give the result to the caller rather than print. Just use print(order(string)) to print the return value.
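With the sample string from the question, that looks like this (a quick usage sketch; the expected output is the one stated above):

>>> string = "is2 Thi1s T4est 3a"
>>> print(order(string))
Thi1s is2 3a T4est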
If you assume the words are numbered consecutively starting at 1, then you can even sort them in O(N) time:
def order(sentence):
    digit = re.compile(r'\d+')
    words = sentence.split()
    result = [None] * len(words)
    for word in words:
        index = int(digit.search(word).group())
        result[index - 1] = word
    return ' '.join(result)
This works by creating a list of the same length, then using the digits from each word to put the word into the correct index (minus 1, as Python lists start at 0, not 1).
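For the sample input, the intermediate values look roughly like this (a sketch of the steps, using the string from the question):

words  = ['is2', 'Thi1s', 'T4est', '3a']
# extracted digits: 2, 1, 4, 3
result = ['Thi1s', 'is2', '3a', 'T4est']
# ' '.join(result) -> 'Thi1s is2 3a T4est'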
I think the bug is simply a misuse of join(). You want to concatenate onto the running result string; i is just a token, so add it to the end of the string. Code untested.
import re
string = "is2 Thi1s T4est 3a"
def order(sentence):
    res = ''
    count = 1
    list = sentence.split()
    for i in list:
        for i in list:
            a = re.findall('\d+', i)
            if a == [str(count)]:
                res = res + " " + i  # your bug here
                count += 1
    print(res)
order(string)
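One small note that isn't in the original answer: because every matched word is prefixed with " ", the printed result starts with a leading space. Stripping the result before printing avoids that:

print(res.strip())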
Input:
string = "My dear adventurer, do you understand the nature of the given discussion?"
expected output:
string = 'My dear ##########, do you ########## the nature ## the given ##########?'
How can you replace every third word in a string of words with the #-length equivalent of that word, while not counting special characters found in the string such as apostrophes ('), quotation marks ("), full stops (.), commas (,), exclamation marks (!), question marks (?), colons (:) and semicolons (;)?
I took the approach of converting the string to a list of elements, but I am finding it difficult to filter out the special characters and replace the words with their # equivalents. Is there a better way to go about it?
I solved it with:
s = "My dear adventurer, do you understand the nature of the given discussion?"
def replace_alphabet_with_char(word: str, replacement: str) -> str:
    new_word = []
    alphabet = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
    for c in word:
        if c in alphabet:
            new_word.append(replacement)
        else:
            new_word.append(c)
    return "".join(new_word)

every_nth_word = 3
s_split = s.split(' ')
result = " ".join([replace_alphabet_with_char(s_split[i], '#') if i % every_nth_word == every_nth_word - 1
                   else s_split[i] for i in range(len(s_split))])
print(result)
Output:
My dear ##########, do you ########## the nature ## the given ##########?
There are more efficient ways to solve this question, but I hope this is the simplest!
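As a side note (just a sketch, not part of the answer above): str.isalpha() can replace the explicit alphabet string, though it also treats accented letters as alphabetic:

def replace_alphabet_with_char(word: str, replacement: str) -> str:
    return "".join(replacement if c.isalpha() else c for c in word)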
My approach is:
Split the sentence into a list of the words
Using that, make a list of every third word.
Remove unwanted characters from this
Replace third words in original string with # times the length of the word.
Here's the code (explained in comments):
# original line
line = "My dear adventurer, do you understand the nature of the given discussion?"

# printing the original line
print(f'\n\nOriginal Line:\n"{line}"\n')

# printing something to indicate that the next few prints show what happens after each step
print('\n\nStages of parsing:')

# splitting by spaces, into a list
wordList = line.split(' ')

# printing the word list
print(wordList)

# making a list of every third word
thirdWordList = [wordList[i-1] for i in range(1, len(wordList)+1) if i % 3 == 0]

# printing the third-word list
print(thirdWordList)

# characters that you don't want hashed
unwantedCharacters = ['.','/','|','?','!','_','"',',','-','#','\n','\\',':',';','(',')','<','>','{','}','[',']','%','*','&','+']

# replacing these characters with empty strings in the list of third words
for unwantedchar in unwantedCharacters:
    for i in range(0, len(thirdWordList)):
        thirdWordList[i] = thirdWordList[i].replace(unwantedchar, '')

# printing the third-word list, now without punctuation
print(thirdWordList)

# replacing with #
for word in thirdWordList:
    line = line.replace(word, len(word)*'#')

# Voila! Printing the result:
print(f'\n\nFinal Output:\n"{line}"\n\n')
Hope this helps!
The following works and does not use regular expressions:
special_chars = {'.','/','|','?','!','_','"',',','-','#','\n','\\'}

def format_word(w, fill):
    if w[-1] in special_chars:
        return fill*(len(w) - 1) + w[-1]
    else:
        return fill*len(w)

def obscure(string, every=3, fill='#'):
    return ' '.join(
        (format_word(w, fill) if (i+1) % every == 0 else w)
        for (i, w) in enumerate(string.split())
    )
Here is some example usage:
In [15]: obscure(string)
Out[15]: 'My dear ##########, do you ########## the nature ## the given ##########?'
In [16]: obscure(string, 4)
Out[16]: 'My dear adventurer, ## you understand the ###### of the given ##########?'
In [17]: obscure(string, 3, '?')
Out[17]: 'My dear ??????????, do you ?????????? the nature ?? the given ???????????'
With the help of some regex; the explanation is in the comments.
import re

imp = "My dear adventurer, do you understand the nature of the given discussion?"
every_nth = 3  # in case you want to change this later
out_list = []

# split the input at spaces, enumerate the parts for looping
for idx, word in enumerate(imp.split(' ')):
    # only do the special logic for multiples of n (0-indexed, thus +1)
    if (idx + 1) % every_nth == 0:
        # find how many special chars there are in the current segment
        len_special_chars = len(re.findall(r'[.,!?:;\'"]', word))
        # ^ add more special chars here if needed
        # subtract the number of special chars from the length of the segment
        str_len = len(word) - len_special_chars
        # repeat '#' for every non-special char and keep any trailing special chars
        hashed = '#' * str_len + (word[-len_special_chars:] if len_special_chars > 0 else '')
        out_list.append(hashed)
    else:
        # if the index is not a multiple of n, just add the word
        out_list.append(word)

print(' '.join(out_list))
A mix of regex and string manipulation:
import re

string = "My dear adventurer, do you understand the nature of the given discussion?"

new_string = []
for i, s in enumerate(string.split()):
    if (i+1) % 3 == 0:
        s = re.sub(r'[^\.:,;\'"!\?]', '#', s)
    new_string.append(s)

new_string = ' '.join(new_string)
print(new_string)
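Assuming the sample sentence above, this should print the expected result:

My dear ##########, do you ########## the nature ## the given ##########?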
This question was asked in an exam, but my code (given below) passed just 2 cases out of 7.
Input format: a single line of input separated by a comma
Input: str = "abcd,b"
Output: 6
"ab", "abc", "abcd", "b", "bc" and "bcd" are the required substrings.
def slicing(s, k, n):
    loop_value = n - k + 1
    res = []
    for i in range(loop_value):
        res.append(s[i: i + k])
    return res

x, y = input().split(',')
n = len(x)
res1 = []
for i in range(1, n + 1):
    res1 += slicing(x, i, n)

count = 0
for ele in res1:
    if y in ele:
        count += 1
print(count)
When the target string (ts) is found in the string S, you can compute the number of substrings containing that instance by multiplying the number of characters before the target by the number of characters after the target (plus one on each side).
This will cover all substrings that contain this instance of the target string leaving only the "after" part to analyse further, which you can do recursively.
def countsubs(S, ts):
    if ts not in S: return 0                     # shorter or no match
    before, after = S.split(ts, 1)               # split on target
    result = (len(before)+1)*(len(after)+1)      # count for this instance
    return result + countsubs(ts[1:]+after, ts)  # recurse with right side

print(countsubs("abcd", "b"))  # 6
This will work for single character and multi-character targets and will run much faster than checking all combinations of substrings one by one.
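As a worked trace of the example call (just applying the formula above): countsubs("abcd", "b") splits into before = "a" and after = "cd", so this instance contributes (1+1)*(2+1) = 6 substrings; the recursive call countsubs("cd", "b") finds no further match and returns 0, giving the total of 6.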
Here is a simple solution without recursion:
def my_function(s):
    l, target = s.split(',')
    result = []
    for i in range(len(l)):
        for j in range(i+1, len(l)+1):
            ss = l[i] + l[i+1:j]
            if target in ss:
                result.append(ss)
    return f'count = {len(result)}, substrings = {result}'

print(my_function("abcd,b"))
# count = 6, substrings = ['ab', 'abc', 'abcd', 'b', 'bc', 'bcd']
Here you go, this should help
from itertools import combinations

output = []
initial = input('Enter string and needed letter separated by commas: ')  # asking for input
list1 = initial.split(',')  # splitting the input into two parts, i.e. the actual text and the letter we want common in the output
text = list1[0]

# this is the core part of our code: from this statement we get all the available
# combinations of the set of letters (all the way from 1-letter combinations to n letters)
final = [''.join(l) for i in range(len(text)) for l in combinations(text, i+1)]

for i in final:
    if list1[1] in i:
        output.append(i)  # only outputting the results which have the required letter/phrase in it
I have a Python list of string names where I would like to remove a common substring from all of the names.
After reading this similar answer, I could almost achieve the desired result using SequenceMatcher.
But only when all items have a common substring:
From List:
string 1 = myKey_apples
string 2 = myKey_appleses
string 3 = myKey_oranges
common substring = "myKey_"
To List:
string 1 = apples
string 2 = appleses
string 3 = oranges
However I have a slightly noisy list that contains a few scattered items that don't fit the same naming convention.
I would like to remove the "most common" substring from the majority:
From List:
string 1 = myKey_apples
string 2 = myKey_appleses
string 3 = myKey_oranges
string 4 = foo
string 5 = myKey_Banannas
common substring = ""
To List:
string 1 = apples
string 2 = appleses
string 3 = oranges
string 4 = foo
string 5 = Banannas
I need a way to match the "myKey_" substring so I can remove it from all names.
But when I use the SequenceMatcher the item "foo" causes the "longest match" to be equal to blank "".
I think the only way to solve this is to find the "most common substring". But how could that be accomplished?
Basic example code:
from difflib import SequenceMatcher

names = ["myKey_apples",
         "myKey_appleses",
         "myKey_oranges",
         #"foo",
         "myKey_Banannas"]

string2 = names[0]
for i in range(1, len(names)):
    string1 = string2
    string2 = names[i]
    match = SequenceMatcher(None, string1, string2).find_longest_match(0, len(string1), 0, len(string2))
    print(string1[match.a: match.a + match.size])  # -> myKey_
Given names = ["myKey_apples", "myKey_appleses", "myKey_oranges", "foo", "myKey_Banannas"]
An O(n^2) solution I can think of is to find the longest matching substring for every pair of names and store these matches in a dictionary with the number of times they occur:
substring_counts = {}
for i in range(0, len(names)):
    for j in range(i+1, len(names)):
        string1 = names[i]
        string2 = names[j]
        match = SequenceMatcher(None, string1, string2).find_longest_match(0, len(string1), 0, len(string2))
        matching_substring = string1[match.a:match.a+match.size]
        if matching_substring not in substring_counts:
            substring_counts[matching_substring] = 1
        else:
            substring_counts[matching_substring] += 1

print(substring_counts)  # {'myKey_': 5, 'myKey_apples': 1, 'o': 1, '': 3}
And then pick the most frequently occurring substring:
import operator
max_occurring_substring = max(substring_counts.items(), key=operator.itemgetter(1))[0]
print(max_occurring_substring)  # myKey_
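To then strip that substring from every name (a small follow-up sketch, not part of the answer above; the result matches the "To List" in the question):

cleaned = [name.replace(max_occurring_substring, '', 1) for name in names]
print(cleaned)  # ['apples', 'appleses', 'oranges', 'foo', 'Banannas']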
Here's an overly verbose solution to your problem:
def find_matching_key(list_in, max_key_only=True):
    """
    returns the longest matching key in the list * with the highest frequency
    """
    keys = {}
    curr_key = ''
    # If n does not exceed max_n, don't bother adding
    max_n = 0
    for word in list(set(list_in)):  # get unique values to speed up
        for i in range(len(word)):
            # Look up the whole word, then one less letter, sequentially
            curr_key = word[0:len(word)-i]
            # if not in, count occurrence
            if curr_key not in keys.keys() and curr_key != '':
                n = 0
                for word2 in list_in:
                    if curr_key in word2:
                        n += 1
                # if large n, add to dictionary
                if n > max_n:
                    max_n = n
                    keys[curr_key] = n
        # Finish the word
    # Finish for loop
    if max_key_only:
        return max(keys, key=keys.get)
    else:
        return keys

# Create your "from list"
From_List = [
    "myKey_apples",
    "myKey_appleses",
    "myKey_oranges",
    "foo",
    "myKey_Banannas"
]

# Use the function
key = find_matching_key(From_List, True)

# Iterate over your list, replacing values
new_From_List = [x.replace(key, '') for x in From_List]
print(new_From_List)
['apples', 'appleses', 'oranges', 'foo', 'Banannas']
Needless to say, this solution would look a lot neater with recursion. Thought I'd sketch out a rough dynamic programming solution for you though.
I would first find the starting letter with the most occurrences. Then I would take each word having that starting letter, and keep taking letters while all these words have matching letters. In the end I would remove the prefix that was found from each word that starts with it:
from collections import Counter
from itertools import takewhile

strings = ["myKey_apples", "myKey_appleses", "myKey_oranges", "berries"]

def remove_mc_prefix(words):
    cnt = Counter()
    for word in words:
        cnt[word[0]] += 1
    first_letter = cnt.most_common(1)[0][0]
    filter_list = [word for word in words if word[0] == first_letter]
    filter_list.sort(key=lambda s: len(s))  # to avoid an index out of bounds
    prefix = ""
    length = len(filter_list[0])
    for i in range(length):
        test = filter_list[0][i]
        if all([word[i] == test for word in filter_list]):
            prefix += test
        else:
            break
    return [word[len(prefix):] if word.startswith(prefix) else word for word in words]

print(remove_mc_prefix(strings))
Out: ['apples', 'appleses', 'oranges', 'berries']
To find the most common substring from a list of Python strings:
I tested this on Python 3.10.5; I hope it works for you.
I have the same use case but a different kind of task: I just need to find one common pattern string from a list of more than 100 files, to use as a regular expression.
Your basic example code did not work in my case, because it checks the 1st string against the 2nd, the 2nd against the 3rd, the 3rd against the 4th, and so on. So I changed it to keep the most common substring found so far and check it against each name.
The downside of this code is that if some name has nothing in common with the most common substring, the final most common substring will be empty.
But in my case, it works.
from difflib import SequenceMatcher

for i in range(1, len(names)):
    if i == 1:
        string1, string2 = names[0], names[i]
    else:
        string1, string2 = most_common_substring, names[i]
    match = SequenceMatcher(None, string1, string2).find_longest_match(0, len(string1), 0, len(string2))
    most_common_substring = string1[match.a: match.a + match.size]

print(f"most_common_substring : {most_common_substring}")
I'm having trouble with a script that replaces normal letters with special characters to test a translation system. Here's an example: cha-mate is chá-mate, but it should also be tested as chã-mate/chã-máte and other variations. Instead of creating these variations, it's switching all of the same characters to only one special letter. Here's what it's printing:
chá-máte
chã-mãte
Here's what should print in theory:
cha-máte
cha-mãte
chá-mate
chã-mate
etc.
Here's the code and the json utilized:
def translation_tester(word):
    esp_chars = {
        'a': 'áã',
    }
    #words = [word]
    for esp_char in esp_chars:
        if esp_char in word:
            replacement_chars = esp_chars[esp_char]
            for i in range(len(replacement_chars)):
                print(word.replace(esp_char, replacement_chars[i]))

def main():
    words = ['cha-mate']
    for word in words:
        translation_tester(word)

main()
Anyway, any help is appreciated, thanks in advance!
To handle an arbitrary number of replacements, you need to use recursion. This is how I did it:
intword = 'cha-mate'
esp_chars = {'a': 'áã'}

def wpermute(word, i=0):
    for idx, c in enumerate(word[i:], i):
        if c in esp_chars:
            for s in esp_chars[c]:
                newword = word[0:idx] + s + word[idx + 1:]
                wpermute(newword, idx + 1)
        if idx == len(word) - 1:
            print(word)

wpermute(intword)
which gives as output the 9 different ways the word can be written:
chá-máte
chá-mãte
chá-mate
chã-máte
chã-mãte
chã-mate
cha-máte
cha-mãte
cha-mate
There might be a nicer way to do this, but you can do the following (making sure to include the plain 'a' in the list of replacement chars):
import itertools
import re

def replace_at_indices(word, new_chars, indices):
    new_word = word
    for i, index in enumerate(indices):
        new_word = new_word[:index] + new_chars[i] + new_word[index+1:]
    return new_word

def translation_tester(word):
    esp_chars = {
        'a': 'aáã',
    }
    for esp_char in esp_chars:
        replacement_chars = list(esp_chars[esp_char])
        indices = [m.start() for m in re.finditer(esp_char, word)]
        product = list(itertools.product(replacement_chars, repeat=len(indices)))
        for p in product:
            new_word = replace_at_indices(word, p, indices)
            print(new_word)

def main():
    words = ['cha-mate']
    for word in words:
        translation_tester(word)

main()
For your example, this should give you:
cha-mate
cha-máte
cha-mãte
chá-mate
chá-máte
chá-mãte
chã-mate
chã-máte
chã-mãte
See also:
Find all occurrences of a substring in Python
generating permutations with repetitions in python
Replacing a character from a certain index
Very new to Python programming. How do I display 2 words before and after a key search word? In the example below I am looking for the search word = lists
Sample:
Line 1: List of the keyboard shortcuts for Word 2000
Line 2: Sequences: strings, lists, and tuples - PythonLearn
Desired result (the word 'lists' is found only in line 2):
Line 2: Sequences: strings, lists, and tuples
Thanks for your help in this.
This solution is based on Avinash Raj's second example with these amendments:
Allows the number of words to be printed each side of the search word to be varied
Uses a list comprehension instead of if inside for, which may be considered more 'Pythonic', though I'm not sure in this case if it's more readable.
s = """List of the keyboard shortcuts for Word 2000
Sequences: strings, lists and tuples - PythonLearn"""
findword = 'lists'
numwords = 2
for i in s.split('\n'):
    z = i.split(' ')
    for x in [x for (x, y) in enumerate(z) if findword in y]:
        print(' '.join(z[max(x-numwords, 0):x+numwords+1]))
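With the sample text above, this should print the match from line 2 only (line 1 contains 'List' but not 'lists'):

Sequences: strings, lists and tuples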
Through the re.findall function:
>>> s = """List of the keyboard shortcuts for Word 2000
Sequences: strings, lists, and tuples - PythonLearn"""
>>> re.findall(r'\S+ \S+ \S*\blists\S* \S+ \S+', s)
['Sequences: strings, lists, and tuples']
Without regex.
>>> s = """List of the keyboard shortcuts for Word 2000
Sequences: strings, lists, and tuples - PythonLearn"""
>>> for i in s.split('\n'):
        z = i.split()
        for x, y in enumerate(z):
            if 'lists' in y:
                print(z[x-2]+' '+z[x-1]+' '+z[x]+' '+z[x+1]+' '+z[x+2])
Sequences: strings, lists, and tuples
This is the solution I can think of right away for your question :-)
def get_word_list(line, keyword, length, splitter):
    word_list = line.split(keyword)
    if len(word_list) == 1:
        return []
    search_result = []
    temp_result = ""
    index = 0
    while index < len(word_list):
        result = word_list[index].strip().split(splitter, length-1)[-1]
        result += " " + keyword
        if index+1 >= len(word_list):  # >= so the last chunk does not raise an IndexError
            search_result.append(result.strip())
            break
        right_string = word_list[index+1].lstrip(" ").split(splitter, length+1)[:length]
        print(word_list[index+1].lstrip(), right_string)
        result += " " + " ".join(right_string)
        search_result.append(result.strip())
        index += 2
    return search_result

def search(file, keyword, length=2, splitter=" "):
    search_results = []
    with open(file, "r") as fo:
        for line in fo:
            line = line.strip()
            search_results += get_word_list(line, keyword, length, splitter)
    for result in search_results:
        print("Result:", result)