Divide too long text in similar chunks considering punctuation - python

I have a list of strings that have to be not more than X characters. Each string can contain many sentences (separated by punctuation like dots). I need to separate longer sentences than X characters with this logic:
I have to divide them into the minimum number of parts (starting from 2), in order to have all the chunks with a lower length than X as similar as possible (possibly identical), but considering the punctuation (example: if I have Hello. How are you?, I can't divide it into Hello. Ho and w are you? but in Hello. and How are you? because it's the most similar way to divide it into two equal parts, without loosing the sense of the sentences)
max = 10
strings = ["Hello. How are you? I'm fine", "other string containg dots", "another string containg dots"]
for string in string:
if len(string) > max:
#algorithm to chunck it
In this case, I will have to divide the first string Hello. How are you? I'm fine into 3 parts because with 2 parts, I'll have one of the 2 chunks longer than 10 characters (max).
Is there a smart existing solution? Or does anyone know how to do that?

An example function for chunking string (within the character minimum and maximum lengths) by punctuation (e.g. ".", ",", ";", "?"); in other words, prioritizing punctuation over character length:
import numpy as np
def chunkingStringFunction(strings, charactersDefiningChunking = [".", ",", ";", "?"], numberOfMaximumCharactersPerChunk = None, numberOfMinimumCharactersPerChunk = None, **kwargs):
if numberOfMaximumCharactersPerChunk is None:
numberOfMaximumCharactersPerChunk = 100
if numberOfMinimumCharactersPerChunk is None:
numberOfMinimumCharactersPerChunk = 2
storingChunksOfString = []
for string in strings:
chunkingStartingAtThisIndex = 0
indexingCharactersInStrings = 0
while indexingCharactersInStrings < len(string) - 1:
indexingCharactersInStrings += 1
currentChunk = string[chunkingStartingAtThisIndex:indexingCharactersInStrings + 1]
if len(currentChunk) >= numberOfMinimumCharactersPerChunk and len(currentChunk) <= numberOfMaximumCharactersPerChunk:
indexesForStops = []
for indexingCharacterDefiningChunking in range(len(charactersDefiningChunking)):
indexesForStops.append(currentChunk.find(charactersDefiningChunking[indexingCharacterDefiningChunking]) + chunkingStartingAtThisIndex)
indexesForStops = np.max(indexesForStops, axis = None)
addChunk = string[chunkingStartingAtThisIndex:indexesForStops + 1]
if len(addChunk) > 1 and addChunk != " ":
storingChunksOfString.append(addChunk)
chunkingStartingAtThisIndex = indexesForStops + 1
indexingCharactersInStrings = chunkingStartingAtThisIndex
return storingChunksOfString
Alternatively, to prioritize character length; as in, if we want to consider our (average) character length and from there, find out where our defined characters for chunking are:
import numpy as np
def chunkingStringFunction(strings, charactersDefiningChunking = [".", ",", ";", "?"], averageNumberOfCharactersPerChunk = None, **kwargs):
if averageNumberOfCharactersPerChunk is None:
averageNumberOfCharactersPerChunk = 10
storingChunksOfString = []
for string in strings:
lastIndexChunked = 0
for indexingCharactersInString in range(1, len(string), 1):
chunkStopsAtADefinedCharacter = False
if indexingCharactersInString - lastIndexChunked == averageNumberOfCharactersPerChunk:
indexingNumberOfCharactersAwayFromAverageChunk = 1
while chunkStopsAtADefinedCharacter == False:
indexingNumberOfCharactersAwayFromAverageChunk += 1
for thisCharacter in charactersDefiningChunking:
findingAChunkCharacter = string[indexingCharactersInString - indexingNumberOfCharactersAwayFromAverageChunk:indexingCharactersInString + (indexingNumberOfCharactersAwayFromAverageChunk + 1)].find(thisCharacter)
if findingAChunkCharacter > -1 and len(string[lastIndexChunked:indexingCharactersInString - indexingNumberOfCharactersAwayFromAverageChunk + findingAChunkCharacter + 1]) != 0:
storingChunksOfString.append(string[lastIndexChunked:indexingCharactersInString - indexingNumberOfCharactersAwayFromAverageChunk + findingAChunkCharacter + 1])
lastIndexChunked = indexingCharactersInString - indexingNumberOfCharactersAwayFromAverageChunk + findingAChunkCharacter + 1
chunkStopsAtADefinedCharacter = True
elif indexingCharactersInString == len(string) - 1 and lastIndexChunked != len(string) - 1 and len(string[lastIndexChunked:indexingCharactersInString + 1]) != 0:
storingChunksOfString.append(string[lastIndexChunked:indexingCharactersInString + 1])
return storingChunksOfString

Related

generate a sequence with respect to subsequences in python

I try to generate the following sequences.
text = ACCCEBCE
target = 000000D0
a random text of different characters is generated. In the text sequence, if the following subsequences are found, the target is going to be D or E. Otherwise, the target will be 0.
ABC --> D
BCD --> E
I write the following code. It works well if I generate a small number of characters. But it does not give any output if I make timesteps = 1000 etc.
import string
import random as rn
import numpy as np
def is_subseq(x, y):
it = iter(y)
return all(any(c == ch for c in it) for ch in x)
def count(a, b, m, n):
# If both first and second string
# is empty, or if second string
# is empty, return 1
if ((m == 0 and n == 0) or n == 0):
return 1
# If only first string is empty
# and second string is not empty,
# return 0
if (m == 0):
return 0
# If last characters are same
# Recur for remaining strings by
# 1. considering last characters
# of both strings
# 2. ignoring last character
# of first string
if (a[m - 1] == b[n - 1]):
return (count(a, b, m - 1, n - 1) +
count(a, b, m - 1, n))
else:
# If last characters are different,
# ignore last char of first string
# and recur for remaining string
return count(a, b, m - 1, n)
# create a sequence classification instance
def get_sequence(n_timesteps):
alphabet="ABCDE"#string.ascii_uppercase
text = ''.join(rn.choices(alphabet, k=n_timesteps))
print(text)
seq_length=3
subseqX = []
subseqY = []
for i in range(0, len(alphabet) - seq_length, 1):
seq_in = alphabet[i:i + seq_length]
seq_out = alphabet[i + seq_length]
subseqX.append([char for char in seq_in])
subseqY.append(seq_out)
print(seq_in, "\t-->\t",seq_out)
y2 = []
match = 0
countlist=np.zeros(len(subseqX))
for i, val in enumerate(text):
found = False
counter = 0
for g, val2 in enumerate(subseqX):
listToStr = ''.join(map(str, subseqX[g]))
howmany = count(text[:i], listToStr, len(text[:i]),len(listToStr))
if is_subseq(listToStr, text[:i]):
if countlist[g] < howmany:
match = match + howmany
countlist[g] = howmany
temp = g
found = True
if found:
y2.append(subseqY[temp])
else:
y2.append(0)
print("counter:\t", counter)
print(text)
print(y2)
# define problem properties
n_timesteps = 100
get_sequence(n_timesteps)
It might be because of the depth of the recursive function. But I need to generate 1000 or 10000 characters.
How can I fix this problem? Any ideas?
I'm not sure I understand all you're trying to do (lots of code there), but I believe this simplified form of the function should work. It maintains a set of subsequences seen so far. It only extends them by adding the next letter when it is encountered. This allows the flagging to know if the prefix to the sequence up to the current character has been seen before.
def flagSequence(S,letters="ABCDE",seqLen=3):
subSeqs = set()
result = "0"
for c in S[:-1]:
p = letters.index(c)
subSeqs.add(c)
if p>0:
subSeqs.update([s+c for s in subSeqs if s[-1]==letters[p-1]])
if p in range(seqLen-1,len(letters)-1) and letters[p-seqLen+1:p+1] in subSeqs:
result += letters[p+1]
else:
result += "0"
return result
output:
text = "BDBACCBECEECAEAEDCAACBCCDDDBBDEEDABDBDE"
print(text)
print(flagSequence(text))
BDBACCBECEECAEAEDCAACBCCDDDBBDEEDABDBDE
000000000D00D0000ED00D0DDEEE00E00E00E0E
with more letters:
alphabet=string.ascii_uppercase
text = ''.join(rn.choices(alphabet, k=10000))
flags = flagSequence(text,alphabet)
print(text[:60])
print(flags[:60])
CHUJKAMWCAAIBXGIZFHALAWWFDDELXREMOQQVXFPNYJRQESRVEJKIAQILYSJ...
000000000000000000000M000000FM00FN00000G0OZK0RFTS0FKLJ0RJMZT...
with longer sequences:
alphabet=string.ascii_uppercase
text = ''.join(rn.choices(alphabet, k=10000))
flags = flagSequence(text,alphabet,seqLen=10)
print(text[200:260])
print(flags[200:260])
...PMZCDQXAOHVMTRLYCNCJABGGNZYAWIHJJCQKMMAENQFHNQTOQOPPGHVQZXZU...
...00N0000Y000WN000Z0O0K0000O0Z0X00KK00LNN00O000O00P0PQQ00WR0Y0...

Replace all occurrences of the substring in string using string slicing

I want to replace all substring occurrences in a string, but I wish not to use the replace method. At the moment, experiments have led me to this:
def count_substrings_and_replace(string, substring, rpl=None):
string_size = len(string)
substring_size = len(substring)
count = 0
_o = string
for i in range(0, string_size - substring_size + 1):
if string[i:i + substring_size] == substring:
if rpl:
print(_o[:i] + rpl + _o[i + substring_size:])
count += 1
return count, _o
count_substrings_and_replace("aaabaaa", "aaa", "ddd")
but I have output like this:
dddbaaa
aaabddd
not dddbddd.
Update 1:
I figured out that I can only replace correctly with a string of the same length of substring. For example for count_substrings_and_replace("aaabaaa", "aaa", "d") I got output: (2, 'dbaad') not dbd
Update 2:
Issue described in update 1 did appear because of string comparing relative to the original string (line 8) that does not change throughout the process.
Fixed:
def count_substrings_and_replace(string, substring, rpl=None):
string_size = len(string)
substring_size = len(substring)
count = 0
_o = string
for i in range(0, string_size - substring_size + 1):
if _o[i:i + substring_size] == substring:
if rpl:
_o = _o[:i] + rpl + _o[i + substring_size:]
count += 1
return count, _o
count_substrings_and_replace("aaabaaa", "aaa", "d")
Output: (2, dbd)
You never update the value of _o when a match is found, you're only printing out what it'd look like if it was to be replaced. Instead, inside that innermost if statement should be two lines like:
_o = _o[:i] + rpl + _o[i + substring_size:]
print(_o)
That would print the string every time a match is found and replaced, moving the print statement to run after the for loop would make it only run once the entire string was parsed and replaced appropriately.
Just my mistake. I had to pass the value to the variable on each iteration not print:
_o = _o[:i] + rpl + _o[i + substring_size:]

How to find the most amount of shared characters in two strings? (Python)

yamxxopd
yndfyamxx
Output: 5
I am not quite sure how to find the number of the most amount of shared characters between two strings. For example (the strings above) the most amount of characters shared together is "yamxx" which is 5 characters long.
xx would not be a solution because that is not the most amount of shared characters. In this case the most is yamxx which is 5 characters long so the output would be 5.
I am quite new to python and stack overflow so any help would be much appreciated!
Note: They should be the same order in both strings
Here is simple, efficient solution using dynamic programming.
def longest_subtring(X, Y):
m,n = len(X), len(Y)
LCSuff = [[0 for k in range(n+1)] for l in range(m+1)]
result = 0
for i in range(m + 1):
for j in range(n + 1):
if (i == 0 or j == 0):
LCSuff[i][j] = 0
elif (X[i-1] == Y[j-1]):
LCSuff[i][j] = LCSuff[i-1][j-1] + 1
result = max(result, LCSuff[i][j])
else:
LCSuff[i][j] = 0
print (result )
longest_subtring("abcd", "arcd") # prints 2
longest_subtring("yammxdj", "nhjdyammx") # prints 5
This solution starts with sub-strings of longest possible lengths. If, for a certain length, there are no matching sub-strings of that length, it moves on to the next lower length. This way, it can stop at the first successful match.
s_1 = "yamxxopd"
s_2 = "yndfyamxx"
l_1, l_2 = len(s_1), len(s_2)
found = False
sub_length = l_1 # Let's start with the longest possible sub-string
while (not found) and sub_length: # Loop, over decreasing lengths of sub-string
for start in range(l_1 - sub_length + 1): # Loop, over all start-positions of sub-string
sub_str = s_1[start:(start+sub_length)] # Get the sub-string at that start-position
if sub_str in s_2: # If found a match for the sub-string, in s_2
found = True # Stop trying with smaller lengths of sub-string
break # Stop trying with this length of sub-string
else: # If no matches found for this length of sub-string
sub_length -= 1 # Let's try a smaller length for the sub-strings
print (f"Answer is {sub_length}" if found else "No common sub-string")
Output:
Answer is 5
s1 = "yamxxopd"
s2 = "yndfyamxx"
# initializing counter
counter = 0
# creating and initializing a string without repetition
s = ""
for x in s1:
if x not in s:
s = s + x
for x in s:
if x in s2:
counter = counter + 1
# display the number of the most amount of shared characters in two strings s1 and s2
print(counter) # display 5

How to split text after a certain number of non-space and non-paragraph characters?

I would like to split up a text after a certain number of non-space and non-paragraph characters.
So far, I know that you can do this to split up a string after a total number of characters
cutOff = 10
splitString = oldString[0:cutOff]
But how do I do this so that it does not factor spaces in the character count?
You can use a regular expression. This returns a two-element tuple (list) containing the two halves of the input string broken at the desired location:
import re
data = """Now is the time
for all good men
to come"""
def break_at_ignoring_whitespace(str, break_at):
m = re.match(r"((\s*\w){%d})(.*)" % break_at, str, re.S)
return (m.group(1), m.group(3)) if m else (str, '')
r = break_at_ignoring_whitespace(data, 14)
print(">>" + r[0] + "<<")
print(">>" + r[1] + "<<")
Result:
>>Now is the time
fo<<
>>r all good men
to come<<
You can do a while loop.
oldString = "Hello world"
cutOff = 10
i = 0
while i < cutOff and cutOff < len(oldString):
if oldString[i] in [' ', '\n']: cutOff += 1
i += 1
splitString = oldString[:cutOff]

Python: replace string, matched from a list

Trying to match and mark character based n-grams. The string
txt = "how does this work"
is to be matched with n-grams from the list
ngrams = ["ow ", "his", "s w"]
and marked with <> – however, only if there is no preceding opened quote. The output i am seeking for this string is h<ow >does t<his w>ork (notice the double match in the 2-nd part, but within just 1 pair of expected quotes).
The for loop i’ve tried for this doesn’t, however, produce the wanted output at all:
switch = False
for i in txt:
if i in "".join(ngrams) and switch == False:
txt = txt.replace(i, "<" + i)
switch = True
if i not in "".join(ngrams) and switch == True:
txt = txt.replace(i, ">" + i)
switch = False
print(txt)
Any help would be greatly appreciated.
This solution uses the str.find method to find all copies of an ngram within the txt string, saving the indices of each copy to the indices set so we can easily handle overlapping matches.
We then copy txt, char by char to the result list, inserting angle brackets where required. This strategy is more efficient than inserting the angle brackets using multiple .replace call because each .replace call needs to rebuild the whole string.
I've extended your data slightly to illustrate that my code handles multiple copies of an ngram.
txt = "how does this work now chisolm"
ngrams = ["ow ", "his", "s w"]
print(txt)
print(ngrams)
# Search for all copies of each ngram in txt
# saving the indices where the ngrams occur
indices = set()
for s in ngrams:
slen = len(s)
lo = 0
while True:
i = txt.find(s, lo)
if i == -1:
break
lo = i + slen
print(s, i)
indices.update(range(i, lo-1))
print(indices)
# Copy the txt to result, inserting angle brackets
# to show matches
switch = True
result = []
for i, u in enumerate(txt):
if switch:
if i in indices:
result.append('<')
switch = False
result.append(u)
else:
result.append(u)
if i not in indices:
result.append('>')
switch = True
print(''.join(result))
output
how does this work now chisolm
['ow ', 'his', 's w']
ow 1
ow 20
his 10
his 24
s w 12
{1, 2, 10, 11, 12, 13, 20, 21, 24, 25}
h<ow >does t<his w>ork n<ow >c<his>olm
If you want adjacent groups to be merged, we can easily do that using the str.replace method. But to make that work properly we need to pre-process the original data, converting all runs of whitespace to single spaces. A simple way to do that is to split the data and re-join it.
txt = "how does this\nwork now chisolm hisow"
ngrams = ["ow", "his", "work"]
#Convert all whitespace to single spaces
txt = ' '.join(txt.split())
print(txt)
print(ngrams)
# Search for all copies of each ngram in txt
# saving the indices where the ngrams occur
indices = set()
for s in ngrams:
slen = len(s)
lo = 0
while True:
i = txt.find(s, lo)
if i == -1:
break
lo = i + slen
print(s, i)
indices.update(range(i, lo-1))
print(indices)
# Copy the txt to result, inserting angle brackets
# to show matches
switch = True
result = []
for i, u in enumerate(txt):
if switch:
if i in indices:
result.append('<')
switch = False
result.append(u)
else:
result.append(u)
if i not in indices:
result.append('>')
switch = True
# Convert the list to a single string
output = ''.join(result)
# Merge adjacent groups
output = output.replace('> <', ' ').replace('><', '')
print(output)
output
how does this work now chisolm hisow
['ow', 'his', 'work']
ow 1
ow 20
ow 34
his 10
his 24
his 31
work 14
{32, 1, 34, 10, 11, 14, 15, 16, 20, 24, 25, 31}
h<ow> does t<his work> n<ow> c<his>olm <hisow>
This should work:
txt = "how does this work"
ngrams = ["ow ", "his", "s w"]
# first find where letters match ngrams
L = len(txt)
match = [False]*L
for ng in ngrams:
l = len(ng)
for i in range(L-l):
if txt[i:i+l] == ng:
for j in range(l):
match[i+j] = True
# then sandwich matches with quotes
out = []
switch = False
for i in range(L):
if not switch and match[i]:
out.append('<')
switch = True
if switch and not match[i]:
out.append('>')
switch = False
out.append(txt[i])
print "".join(out)
Here's a method with only one for loop. I timed it and it's about as fast as the other answers to this question. I think it's a bit more clear, although that might be because I wrote it.
I iterate over the index of the first character in the n-gram, then if it matches, I use a bunch of if-else clauses to see whether I should add a < or > in this situation. I add to the end of the string output from the original txt, so I'm not really inserting in the middle of a string.
txt = "how does this work"
ngrams = set(["ow ", "his", "s w"])
n = 3
prev = -n
output = ''
shift = 0
open = False
for i in xrange(len(txt) - n + 1):
ngram = txt[i:i + n]
if ngram in ngrams:
if i - prev > n:
if open:
output += txt[prev:prev + n] + '>' + txt[prev + n:i] + '<'
elif not open:
if prev > 0:
output += txt[prev + n:i] + '<'
else:
output += txt[:i] + '<'
open = True
else:
output += txt[prev:i]
prev = i
if open:
output += txt[prev:prev + n] + '>' + txt[prev + n:]
print output

Categories