Splitting text without losing separator [duplicate] - python

Here's the simplest way to explain this. Here's what I'm using:
re.split('\W', 'foo/bar spam\neggs')
>>> ['foo', 'bar', 'spam', 'eggs']
Here's what I want:
someMethod('\W', 'foo/bar spam\neggs')
>>> ['foo', '/', 'bar', ' ', 'spam', '\n', 'eggs']
The reason is that I want to split a string into tokens, manipulate it, then put it back together again.

The docs of re.split mention:
Split string by the occurrences of pattern. If capturing
parentheses are used in pattern, then the text of all groups in the
pattern are also returned as part of the resulting list.
So you just need to wrap your separator with a capturing group:
>>> re.split('(\W)', 'foo/bar spam\neggs')
['foo', '/', 'bar', ' ', 'spam', '\n', 'eggs']

If you are splitting on newline, use splitlines(True).
>>> 'line 1\nline 2\nline without newline'.splitlines(True)
['line 1\n', 'line 2\n', 'line without newline']
(Not a general solution, but adding this here in case someone comes here not realizing this method existed.)

another example, split on non alpha-numeric and keep the separators
import re
a = "foo,bar#candy*ice%cream"
re.split('([^a-zA-Z0-9])',a)
output:
['foo', ',', 'bar', '#', 'candy', '*', 'ice', '%', 'cream']
explanation
re.split('([^a-zA-Z0-9])',a)
() <- keep the separators
[] <- match everything in between
^a-zA-Z0-9 <-except alphabets, upper/lower and numbers.

If you have only 1 separator, you can employ list comprehensions:
text = 'foo,bar,baz,qux'
sep = ','
Appending/prepending separator:
result = [x+sep for x in text.split(sep)]
#['foo,', 'bar,', 'baz,', 'qux,']
# to get rid of trailing
result[-1] = result[-1].strip(sep)
#['foo,', 'bar,', 'baz,', 'qux']
result = [sep+x for x in text.split(sep)]
#[',foo', ',bar', ',baz', ',qux']
# to get rid of trailing
result[0] = result[0].strip(sep)
#['foo', ',bar', ',baz', ',qux']
Separator as it's own element:
result = [u for x in text.split(sep) for u in (x, sep)]
#['foo', ',', 'bar', ',', 'baz', ',', 'qux', ',']
results = result[:-1] # to get rid of trailing

Another no-regex solution that works well on Python 3
# Split strings and keep separator
test_strings = ['<Hello>', 'Hi', '<Hi> <Planet>', '<', '']
def split_and_keep(s, sep):
if not s: return [''] # consistent with string.split()
# Find replacement character that is not used in string
# i.e. just use the highest available character plus one
# Note: This fails if ord(max(s)) = 0x10FFFF (ValueError)
p=chr(ord(max(s))+1)
return s.replace(sep, sep+p).split(p)
for s in test_strings:
print(split_and_keep(s, '<'))
# If the unicode limit is reached it will fail explicitly
unicode_max_char = chr(1114111)
ridiculous_string = '<Hello>'+unicode_max_char+'<World>'
print(split_and_keep(ridiculous_string, '<'))

One Lazy and Simple Solution
Assume your regex pattern is split_pattern = r'(!|\?)'
First, you add some same character as the new separator, like '[cut]'
new_string = re.sub(split_pattern, '\\1[cut]', your_string)
Then you split the new separator, new_string.split('[cut]')

You can also split a string with an array of strings instead of a regular expression, like this:
def tokenizeString(aString, separators):
#separators is an array of strings that are being used to split the string.
#sort separators in order of descending length
separators.sort(key=len)
listToReturn = []
i = 0
while i < len(aString):
theSeparator = ""
for current in separators:
if current == aString[i:i+len(current)]:
theSeparator = current
if theSeparator != "":
listToReturn += [theSeparator]
i = i + len(theSeparator)
else:
if listToReturn == []:
listToReturn = [""]
if(listToReturn[-1] in separators):
listToReturn += [""]
listToReturn[-1] += aString[i]
i += 1
return listToReturn
print(tokenizeString(aString = "\"\"\"hi\"\"\" hello + world += (1*2+3/5) '''hi'''", separators = ["'''", '+=', '+', "/", "*", "\\'", '\\"', "-=", "-", " ", '"""', "(", ")"]))

Here is a simple .split solution that works without regex.
This is an answer for Python split() without removing the delimiter, so not exactly what the original post asks but the other question was closed as a duplicate for this one.
def splitkeep(s, delimiter):
split = s.split(delimiter)
return [substr + delimiter for substr in split[:-1]] + [split[-1]]
Random tests:
import random
CHARS = [".", "a", "b", "c"]
assert splitkeep("", "X") == [""] # 0 length test
for delimiter in ('.', '..'):
for _ in range(100000):
length = random.randint(1, 50)
s = "".join(random.choice(CHARS) for _ in range(length))
assert "".join(splitkeep(s, delimiter)) == s

# This keeps all separators in result
##########################################################################
import re
st="%%(c+dd+e+f-1523)%%7"
sh=re.compile('[\+\-//\*\<\>\%\(\)]')
def splitStringFull(sh, st):
ls=sh.split(st)
lo=[]
start=0
for l in ls:
if not l : continue
k=st.find(l)
llen=len(l)
if k> start:
tmp= st[start:k]
lo.append(tmp)
lo.append(l)
start = k + llen
else:
lo.append(l)
start =llen
return lo
#############################
li= splitStringFull(sh , st)
['%%(', 'c', '+', 'dd', '+', 'e', '+', 'f', '-', '1523', ')%%', '7']

replace all seperator: (\W) with seperator + new_seperator: (\W;)
split by the new_seperator: (;)
def split_and_keep(seperator, s):
return re.split(';', re.sub(seperator, lambda match: match.group() + ';', s))
print('\W', 'foo/bar spam\neggs')

If one wants to split string while keeping separators by regex without capturing group:
def finditer_with_separators(regex, s):
matches = []
prev_end = 0
for match in regex.finditer(s):
match_start = match.start()
if (prev_end != 0 or match_start > 0) and match_start != prev_end:
matches.append(s[prev_end:match.start()])
matches.append(match.group())
prev_end = match.end()
if prev_end < len(s):
matches.append(s[prev_end:])
return matches
regex = re.compile(r"[\(\)]")
matches = finditer_with_separators(regex, s)
If one assumes that regex is wrapped up into capturing group:
def split_with_separators(regex, s):
matches = list(filter(None, regex.split(s)))
return matches
regex = re.compile(r"([\(\)])")
matches = split_with_separators(regex, s)
Both ways also will remove empty groups which are useless and annoying in most of the cases.

install wrs "WITHOUT REMOVING SPLITOR" BY DOING
pip install wrs
(developed by Rao Hamza)
import wrs
text = "Now inbox “how to make spam ad” Invest in hard email marketing."
splitor = 'email | spam | inbox'
list = wrs.wr_split(splitor, text)
print(list)
result:
['now ', 'inbox “how to make ', 'spam ad” invest in hard ', 'email marketing.']

I had a similar issue trying to split a file path and struggled to find a simple answer.
This worked for me and didn't involve having to substitute delimiters back into the split text:
my_path = 'folder1/folder2/folder3/file1'
import re
re.findall('[^/]+/|[^/]+', my_path)
returns:
['folder1/', 'folder2/', 'folder3/', 'file1']

I found this generator based approach more satisfying:
def split_keep(string, sep):
"""Usage:
>>> list(split_keep("a.b.c.d", "."))
['a.', 'b.', 'c.', 'd']
"""
start = 0
while True:
end = string.find(sep, start) + 1
if end == 0:
break
yield string[start:end]
start = end
yield string[start:]
It avoids the need to figure out the correct regex, while in theory should be fairly cheap. It doesn't create new string objects and, delegates most of the iteration work to the efficient find method.
... and in Python 3.8 it can be as short as:
def split_keep(string, sep):
start = 0
while (end := string.find(sep, start) + 1) > 0:
yield string[start:end]
start = end
yield string[start:]

May I just leave it here
s = 'foo/bar spam\neggs'
print(s.replace('/', '+++/+++').replace(' ', '+++ +++').replace('\n', '+++\n+++').split('+++'))
['foo', '/', 'bar', ' ', 'spam', '\n', 'eggs']

Use re.split and also your regular expression comes from variable and also you have multi separator ,you can use as the following:
# BashSpecialParamList is the special param in bash,
# such as your separator is the bash special param
BashSpecialParamList = ["$*", "$#", "$#", "$?", "$-", "$$", "$!", "$0"]
# aStr is the the string to be splited
aStr = "$a Klkjfd$0 $? $#%$*Sdfdf"
reStr = "|".join([re.escape(sepStr) for sepStr in BashSpecialParamList])
re.split(f'({reStr})', aStr)
# Then You can get the result:
# ['$a Klkjfd', '$0', ' ', '$?', ' ', '$#', '%', '$*', 'Sdfdf']
reference: GNU Bash Special Parameters

Some of those answers posted before, will repeat delimiter, or have some other bugs which I faced in my case. You can use this function, instead:
def split_and_keep_delimiter(input, delimiter):
result = list()
idx = 0
while delimiter in input:
idx = input.index(delimiter);
result.append(input[0:idx+len(delimiter)])
input = input[idx+len(delimiter):]
result.append(input)
return result

In the below code, there is a simple, very efficient and well tested answer to this question. The code has comments explaining everything in it.
I promise it's not as scary as it looks - it's actually only 13 lines of code! The rest are all comments, docs and assertions
def split_including_delimiters(input: str, delimiter: str):
"""
Splits an input string, while including the delimiters in the output
Unlike str.split, we can use an empty string as a delimiter
Unlike str.split, the output will not have any extra empty strings
Conequently, len(''.split(delimiter))== 0 for all delimiters,
whereas len(input.split(delimiter))>0 for all inputs and delimiters
INPUTS:
input: Can be any string
delimiter: Can be any string
EXAMPLES:
>>> split_and_keep_delimiter('Hello World ! ',' ')
ans = ['Hello ', 'World ', ' ', '! ', ' ']
>>> split_and_keep_delimiter("Hello**World**!***", "**")
ans = ['Hello', '**', 'World', '**', '!', '**', '*']
EXAMPLES:
assert split_and_keep_delimiter('-xx-xx-','xx') == ['-', 'xx', '-', 'xx', '-'] # length 5
assert split_and_keep_delimiter('xx-xx-' ,'xx') == ['xx', '-', 'xx', '-'] # length 4
assert split_and_keep_delimiter('-xx-xx' ,'xx') == ['-', 'xx', '-', 'xx'] # length 4
assert split_and_keep_delimiter('xx-xx' ,'xx') == ['xx', '-', 'xx'] # length 3
assert split_and_keep_delimiter('xxxx' ,'xx') == ['xx', 'xx'] # length 2
assert split_and_keep_delimiter('xxx' ,'xx') == ['xx', 'x'] # length 2
assert split_and_keep_delimiter('x' ,'xx') == ['x'] # length 1
assert split_and_keep_delimiter('' ,'xx') == [] # length 0
assert split_and_keep_delimiter('aaa' ,'xx') == ['aaa'] # length 1
assert split_and_keep_delimiter('aa' ,'xx') == ['aa'] # length 1
assert split_and_keep_delimiter('a' ,'xx') == ['a'] # length 1
assert split_and_keep_delimiter('' ,'' ) == [] # length 0
assert split_and_keep_delimiter('a' ,'' ) == ['a'] # length 1
assert split_and_keep_delimiter('aa' ,'' ) == ['a', '', 'a'] # length 3
assert split_and_keep_delimiter('aaa' ,'' ) == ['a', '', 'a', '', 'a'] # length 5
"""
# Input assertions
assert isinstance(input,str), "input must be a string"
assert isinstance(delimiter,str), "delimiter must be a string"
if delimiter:
# These tokens do not include the delimiter, but are computed quickly
tokens = input.split(delimiter)
else:
# Edge case: if the delimiter is the empty string, split between the characters
tokens = list(input)
# The following assertions are always true for any string input and delimiter
# For speed's sake, we disable this assertion
# assert delimiter.join(tokens) == input
output = tokens[:1]
for token in tokens[1:]:
output.append(delimiter)
if token:
output.append(token)
# Don't let the first element be an empty string
if output[:1]==['']:
del output[0]
# The only case where we should have an empty string in the output is if it is our delimiter
# For speed's sake, we disable this assertion
# assert delimiter=='' or '' not in output
# The resulting strings should be combinable back into the original string
# For speed's sake, we disable this assertion
# assert ''.join(output) == input
return output

>>> line = 'hello_toto_is_there'
>>> sep = '_'
>>> [sep + x[1] if x[0] != 0 else x[1] for x in enumerate(line.split(sep))]
['hello', '_toto', '_is', '_there']

Related

Remove multiple occurrence in a row from a certain character in a string

I want to remove multiple '.' in a row to a single '.' in Python.
This is my code, that i come up with:
# remove multiple occurrences of '.'
string = "FOO...BAR......FOO..BAR.FOO"
last_char = None
new_string = ""
for char in string:
if not (last_char == '.' and char == '.'):
new_string += char
last_char = char
string = new_string
print(string)
and it is indeed doing what i want it to do, but I think there has to be a more elegant way to do this.
>>> FOO.BAR.FOO.BAR.FOO
That's it easier to do with a regex (re module) with pattern "\.+" which means an
a dot \.
which repeats between 1 to infinite amount of times +
import re
string = "FOO...BAR......FOO..BAR.FOO"
string = re.sub(r"\.+", '.', string)
print(string) # FOO.BAR.FOO.BAR.FOO
Or just replace all 2 dots, by one, until you have no more 2 dots together
string = "FOO...BAR......FOO..BAR.FOO"
while string.find('..') >= 0:
string = string.replace("..", ".")
print(string) # FOO.BAR.FOO.BAR.FOO
string = "FOO...BAR......FOO..BAR.FOO"
print (".".join([x for x in string.split(".") if x]))
Output:
FOO.BAR.FOO.BAR.FOO
You split at any given character, and join back to string if the list element is not null with the same given character as delimiter. So you replace all multiple occurrences of the given character by a single one.
Step by step:
string = "FOO...BAR......FOO..BAR.FOO"
print ([x for x in string.split(".")])
print ([x for x in string.split(".") if x])
print (".".join([x for x in string.split(".") if x]))
Output:
['FOO', '', '', 'BAR', '', '', '', '', '', 'FOO', '', 'BAR', 'FOO']
['FOO', 'BAR', 'FOO', 'BAR', 'FOO']
FOO.BAR.FOO.BAR.FOO

Splitting a string with numbers and letters [duplicate]

I'd like to split strings like these
'foofo21'
'bar432'
'foobar12345'
into
['foofo', '21']
['bar', '432']
['foobar', '12345']
Does somebody know an easy and simple way to do this in python?
I would approach this by using re.match in the following way:
import re
match = re.match(r"([a-z]+)([0-9]+)", 'foofo21', re.I)
if match:
items = match.groups()
print(items)
>> ("foofo", "21")
def mysplit(s):
head = s.rstrip('0123456789')
tail = s[len(head):]
return head, tail
>>> [mysplit(s) for s in ['foofo21', 'bar432', 'foobar12345']]
[('foofo', '21'), ('bar', '432'), ('foobar', '12345')]
Yet Another Option:
>>> [re.split(r'(\d+)', s) for s in ('foofo21', 'bar432', 'foobar12345')]
[['foofo', '21', ''], ['bar', '432', ''], ['foobar', '12345', '']]
>>> r = re.compile("([a-zA-Z]+)([0-9]+)")
>>> m = r.match("foobar12345")
>>> m.group(1)
'foobar'
>>> m.group(2)
'12345'
So, if you have a list of strings with that format:
import re
r = re.compile("([a-zA-Z]+)([0-9]+)")
strings = ['foofo21', 'bar432', 'foobar12345']
print [r.match(string).groups() for string in strings]
Output:
[('foofo', '21'), ('bar', '432'), ('foobar', '12345')]
I'm always the one to bring up findall() =)
>>> strings = ['foofo21', 'bar432', 'foobar12345']
>>> [re.findall(r'(\w+?)(\d+)', s)[0] for s in strings]
[('foofo', '21'), ('bar', '432'), ('foobar', '12345')]
Note that I'm using a simpler (less to type) regex than most of the previous answers.
here is a simple function to seperate multiple words and numbers from a string of any length, the re method only seperates first two words and numbers. I think this will help everyone else in the future,
def seperate_string_number(string):
previous_character = string[0]
groups = []
newword = string[0]
for x, i in enumerate(string[1:]):
if i.isalpha() and previous_character.isalpha():
newword += i
elif i.isnumeric() and previous_character.isnumeric():
newword += i
else:
groups.append(newword)
newword = i
previous_character = i
if x == len(string) - 2:
groups.append(newword)
newword = ''
return groups
print(seperate_string_number('10in20ft10400bg'))
# outputs : ['10', 'in', '20', 'ft', '10400', 'bg']
import re
s = raw_input()
m = re.match(r"([a-zA-Z]+)([0-9]+)",s)
print m.group(0)
print m.group(1)
print m.group(2)
without using regex, using isdigit() built-in function, only works if starting part is text and latter part is number
def text_num_split(item):
for index, letter in enumerate(item, 0):
if letter.isdigit():
return [item[:index],item[index:]]
print(text_num_split("foobar12345"))
OUTPUT :
['foobar', '12345']
This is a little longer, but more versatile for cases where there are multiple, randomly placed, numbers in the string. Also, it requires no imports.
def getNumbers( input ):
# Collect Info
compile = ""
complete = []
for letter in input:
# If compiled string
if compile:
# If compiled and letter are same type, append letter
if compile.isdigit() == letter.isdigit():
compile += letter
# If compiled and letter are different types, append compiled string, and begin with letter
else:
complete.append( compile )
compile = letter
# If no compiled string, begin with letter
else:
compile = letter
# Append leftover compiled string
if compile:
complete.append( compile )
# Return numbers only
numbers = [ word for word in complete if word.isdigit() ]
return numbers
Here is simple solution for that problem, no need for regex:
user = input('Input: ') # user = 'foobar12345'
int_list, str_list = [], []
for item in user:
try:
item = int(item) # searching for integers in your string
except:
str_list.append(item)
string = ''.join(str_list)
else: # if there are integers i will add it to int_list but as str, because join function only can work with str
int_list.append(str(item))
integer = int(''.join(int_list)) # if you want it to be string just do z = ''.join(int_list)
final = [string, integer] # you can also add it to dictionary d = {string: integer}
print(final)
In Addition to the answer of #Evan
If the incoming string is in this pattern 21foofo then the re.match pattern would be like this.
import re
match = re.match(r"([0-9]+)([a-z]+)", '21foofo', re.I)
if match:
items = match.groups()
print(items)
>> ("21", "foofo")
Otherwise, you'll get UnboundLocalError: local variable 'items' referenced before assignment error.

How do I split a string at a separator unless that separator is followed by a certain pattern?

I have a Python string
string = aaa1bbb1ccc1ddd
and I want to split it like this
re.split('[split at all occurrences of "1", unless the 1 is followed by a c]', string)
so that the result is
['aaa', 'bbb1ccc', 'ddd']
How do I do this?
Use negative-lookahead with regex and the re module:
>>> string = 'aaa1bbb1ccc1ddd'
>>> import re
>>> re.split(r"1(?!c)", string)
['aaa', 'bbb1ccc', 'ddd']
def split_by_delim_except(s, delim, bar):
escape = '\b'
find = delim + bar
return map(lambda s: s.replace(escape, find),
s.replace(find, escape).split(delim))
split_by_delim_except('aaa1bbb1ccc1ddd', '1', 'c')
Although not as pretty as regex, my following code returns the same result:
string = 'aaa1bbb1ccc1ddd'
Split the string at all instances of '1'
p1 = string.split('1')
Create a new empty list so we can append our desired items to
new_result = []
count = 0
for j in p1:
if j.startswith('c'):
# This removes the previous element from the list and stores it in a variable.
prev_element = new_result.pop(count-1)
prev_one_plus_j = prev_element + '1' + j
new_result.append(prev_one_plus_j)
else:
new_result.append(j)
count += 1
print (new_result)
Output:
['aaa', 'bbb1ccc', 'ddd']

How to prevent an procedure similar to the split () function (but with multiple separators) returns ' ' in its output

guys, I'm a programming newbie trying to improve the procedure bellow in a way that when I pass it this argument: split_string("After the flood ... all the colors came out."," .") it returns it:
['After', 'the', 'flood', 'all', 'the', 'colors', 'came', 'out']
and not this:
['After', 'the', 'flood', '', '', '', '', 'all', 'the', 'colors', 'came', 'out', '']
Any hint of how to do this? (I could just iterate again the list and delete the '' elements, but I wanted a more elegant solution)
This is the procedure:
def split_string(source, separatorList):
splited = [source]
for separator in splitlist:
source = splited
splited = []
print 'separator= ', separator
for sequence in source:
print 'sequence = ', sequence
if sequence not in splitlist and sequence != ' ':
splited = splited + sequence.split(separator)
return splited
print split_string("This is a test-of the,string separation-code!", " ,!-")
print
print split_string("After the flood ... all the colors came out."," .")
You can filter out the empty strings in the return statement:
return [x for x in split if x]
As a side note, I think it would be easier to write your function based on re.split():
def split_string(s, separators):
pattern = "|".join(re.escape(sep) for sep in separators)
return [x for x in re.split(pattern, s) if x]
print re.split('[. ]+', 'After the flood ... all the colors came out.')
or, better, the other way round
print re.findall('[^. ]+', 'After the flood ... all the colors came out.')
Let's see where did the empty strings come from first, try to execute this in shell:
>>> 'After the'.split(' ')
result:
['After', '', 'the']
This was because when split method came to ' ' in the string, it find nothing but '' between two spaces.
So the solution is simple, just check the boolean value of every item get from .split(
def split_string(source, separatorList):
splited = [source]
for separator in separatorList:
# if you want to exchange two variables, then write in one line can make the code more clear
source, splited = splited, []
for sequence in source:
# there's no need to check `sequence` in advance, just split it
# if sequence not in separatorList and sequence != ' ':
# splited = splited + sequence.split(separator)
# code to prevent appearance of `''` is here, do a if check in list comprehension.
# `+=` is equivalent to `= splited +`
splited += [i for i in sequence.split(separator) if i]
return splited
More details about [i for i in a_list if i] see PEP 202

In Python, how do I split a string and keep the separators?

Here's the simplest way to explain this. Here's what I'm using:
re.split('\W', 'foo/bar spam\neggs')
>>> ['foo', 'bar', 'spam', 'eggs']
Here's what I want:
someMethod('\W', 'foo/bar spam\neggs')
>>> ['foo', '/', 'bar', ' ', 'spam', '\n', 'eggs']
The reason is that I want to split a string into tokens, manipulate it, then put it back together again.
The docs of re.split mention:
Split string by the occurrences of pattern. If capturing
parentheses are used in pattern, then the text of all groups in the
pattern are also returned as part of the resulting list.
So you just need to wrap your separator with a capturing group:
>>> re.split('(\W)', 'foo/bar spam\neggs')
['foo', '/', 'bar', ' ', 'spam', '\n', 'eggs']
If you are splitting on newline, use splitlines(True).
>>> 'line 1\nline 2\nline without newline'.splitlines(True)
['line 1\n', 'line 2\n', 'line without newline']
(Not a general solution, but adding this here in case someone comes here not realizing this method existed.)
another example, split on non alpha-numeric and keep the separators
import re
a = "foo,bar#candy*ice%cream"
re.split('([^a-zA-Z0-9])',a)
output:
['foo', ',', 'bar', '#', 'candy', '*', 'ice', '%', 'cream']
explanation
re.split('([^a-zA-Z0-9])',a)
() <- keep the separators
[] <- match everything in between
^a-zA-Z0-9 <-except alphabets, upper/lower and numbers.
If you have only 1 separator, you can employ list comprehensions:
text = 'foo,bar,baz,qux'
sep = ','
Appending/prepending separator:
result = [x+sep for x in text.split(sep)]
#['foo,', 'bar,', 'baz,', 'qux,']
# to get rid of trailing
result[-1] = result[-1].strip(sep)
#['foo,', 'bar,', 'baz,', 'qux']
result = [sep+x for x in text.split(sep)]
#[',foo', ',bar', ',baz', ',qux']
# to get rid of trailing
result[0] = result[0].strip(sep)
#['foo', ',bar', ',baz', ',qux']
Separator as it's own element:
result = [u for x in text.split(sep) for u in (x, sep)]
#['foo', ',', 'bar', ',', 'baz', ',', 'qux', ',']
results = result[:-1] # to get rid of trailing
Another no-regex solution that works well on Python 3
# Split strings and keep separator
test_strings = ['<Hello>', 'Hi', '<Hi> <Planet>', '<', '']
def split_and_keep(s, sep):
if not s: return [''] # consistent with string.split()
# Find replacement character that is not used in string
# i.e. just use the highest available character plus one
# Note: This fails if ord(max(s)) = 0x10FFFF (ValueError)
p=chr(ord(max(s))+1)
return s.replace(sep, sep+p).split(p)
for s in test_strings:
print(split_and_keep(s, '<'))
# If the unicode limit is reached it will fail explicitly
unicode_max_char = chr(1114111)
ridiculous_string = '<Hello>'+unicode_max_char+'<World>'
print(split_and_keep(ridiculous_string, '<'))
One Lazy and Simple Solution
Assume your regex pattern is split_pattern = r'(!|\?)'
First, you add some same character as the new separator, like '[cut]'
new_string = re.sub(split_pattern, '\\1[cut]', your_string)
Then you split the new separator, new_string.split('[cut]')
You can also split a string with an array of strings instead of a regular expression, like this:
def tokenizeString(aString, separators):
#separators is an array of strings that are being used to split the string.
#sort separators in order of descending length
separators.sort(key=len)
listToReturn = []
i = 0
while i < len(aString):
theSeparator = ""
for current in separators:
if current == aString[i:i+len(current)]:
theSeparator = current
if theSeparator != "":
listToReturn += [theSeparator]
i = i + len(theSeparator)
else:
if listToReturn == []:
listToReturn = [""]
if(listToReturn[-1] in separators):
listToReturn += [""]
listToReturn[-1] += aString[i]
i += 1
return listToReturn
print(tokenizeString(aString = "\"\"\"hi\"\"\" hello + world += (1*2+3/5) '''hi'''", separators = ["'''", '+=', '+', "/", "*", "\\'", '\\"', "-=", "-", " ", '"""', "(", ")"]))
Here is a simple .split solution that works without regex.
This is an answer for Python split() without removing the delimiter, so not exactly what the original post asks but the other question was closed as a duplicate for this one.
def splitkeep(s, delimiter):
split = s.split(delimiter)
return [substr + delimiter for substr in split[:-1]] + [split[-1]]
Random tests:
import random
CHARS = [".", "a", "b", "c"]
assert splitkeep("", "X") == [""] # 0 length test
for delimiter in ('.', '..'):
for _ in range(100000):
length = random.randint(1, 50)
s = "".join(random.choice(CHARS) for _ in range(length))
assert "".join(splitkeep(s, delimiter)) == s
# This keeps all separators in result
##########################################################################
import re
st="%%(c+dd+e+f-1523)%%7"
sh=re.compile('[\+\-//\*\<\>\%\(\)]')
def splitStringFull(sh, st):
ls=sh.split(st)
lo=[]
start=0
for l in ls:
if not l : continue
k=st.find(l)
llen=len(l)
if k> start:
tmp= st[start:k]
lo.append(tmp)
lo.append(l)
start = k + llen
else:
lo.append(l)
start =llen
return lo
#############################
li= splitStringFull(sh , st)
['%%(', 'c', '+', 'dd', '+', 'e', '+', 'f', '-', '1523', ')%%', '7']
replace all seperator: (\W) with seperator + new_seperator: (\W;)
split by the new_seperator: (;)
def split_and_keep(seperator, s):
return re.split(';', re.sub(seperator, lambda match: match.group() + ';', s))
print('\W', 'foo/bar spam\neggs')
If one wants to split string while keeping separators by regex without capturing group:
def finditer_with_separators(regex, s):
matches = []
prev_end = 0
for match in regex.finditer(s):
match_start = match.start()
if (prev_end != 0 or match_start > 0) and match_start != prev_end:
matches.append(s[prev_end:match.start()])
matches.append(match.group())
prev_end = match.end()
if prev_end < len(s):
matches.append(s[prev_end:])
return matches
regex = re.compile(r"[\(\)]")
matches = finditer_with_separators(regex, s)
If one assumes that regex is wrapped up into capturing group:
def split_with_separators(regex, s):
matches = list(filter(None, regex.split(s)))
return matches
regex = re.compile(r"([\(\)])")
matches = split_with_separators(regex, s)
Both ways also will remove empty groups which are useless and annoying in most of the cases.
install wrs "WITHOUT REMOVING SPLITOR" BY DOING
pip install wrs
(developed by Rao Hamza)
import wrs
text = "Now inbox “how to make spam ad” Invest in hard email marketing."
splitor = 'email | spam | inbox'
list = wrs.wr_split(splitor, text)
print(list)
result:
['now ', 'inbox “how to make ', 'spam ad” invest in hard ', 'email marketing.']
I had a similar issue trying to split a file path and struggled to find a simple answer.
This worked for me and didn't involve having to substitute delimiters back into the split text:
my_path = 'folder1/folder2/folder3/file1'
import re
re.findall('[^/]+/|[^/]+', my_path)
returns:
['folder1/', 'folder2/', 'folder3/', 'file1']
I found this generator based approach more satisfying:
def split_keep(string, sep):
"""Usage:
>>> list(split_keep("a.b.c.d", "."))
['a.', 'b.', 'c.', 'd']
"""
start = 0
while True:
end = string.find(sep, start) + 1
if end == 0:
break
yield string[start:end]
start = end
yield string[start:]
It avoids the need to figure out the correct regex, while in theory should be fairly cheap. It doesn't create new string objects and, delegates most of the iteration work to the efficient find method.
... and in Python 3.8 it can be as short as:
def split_keep(string, sep):
start = 0
while (end := string.find(sep, start) + 1) > 0:
yield string[start:end]
start = end
yield string[start:]
May I just leave it here
s = 'foo/bar spam\neggs'
print(s.replace('/', '+++/+++').replace(' ', '+++ +++').replace('\n', '+++\n+++').split('+++'))
['foo', '/', 'bar', ' ', 'spam', '\n', 'eggs']
Use re.split and also your regular expression comes from variable and also you have multi separator ,you can use as the following:
# BashSpecialParamList is the special param in bash,
# such as your separator is the bash special param
BashSpecialParamList = ["$*", "$#", "$#", "$?", "$-", "$$", "$!", "$0"]
# aStr is the the string to be splited
aStr = "$a Klkjfd$0 $? $#%$*Sdfdf"
reStr = "|".join([re.escape(sepStr) for sepStr in BashSpecialParamList])
re.split(f'({reStr})', aStr)
# Then You can get the result:
# ['$a Klkjfd', '$0', ' ', '$?', ' ', '$#', '%', '$*', 'Sdfdf']
reference: GNU Bash Special Parameters
Some of those answers posted before, will repeat delimiter, or have some other bugs which I faced in my case. You can use this function, instead:
def split_and_keep_delimiter(input, delimiter):
result = list()
idx = 0
while delimiter in input:
idx = input.index(delimiter);
result.append(input[0:idx+len(delimiter)])
input = input[idx+len(delimiter):]
result.append(input)
return result
In the below code, there is a simple, very efficient and well tested answer to this question. The code has comments explaining everything in it.
I promise it's not as scary as it looks - it's actually only 13 lines of code! The rest are all comments, docs and assertions
def split_including_delimiters(input: str, delimiter: str):
"""
Splits an input string, while including the delimiters in the output
Unlike str.split, we can use an empty string as a delimiter
Unlike str.split, the output will not have any extra empty strings
Conequently, len(''.split(delimiter))== 0 for all delimiters,
whereas len(input.split(delimiter))>0 for all inputs and delimiters
INPUTS:
input: Can be any string
delimiter: Can be any string
EXAMPLES:
>>> split_and_keep_delimiter('Hello World ! ',' ')
ans = ['Hello ', 'World ', ' ', '! ', ' ']
>>> split_and_keep_delimiter("Hello**World**!***", "**")
ans = ['Hello', '**', 'World', '**', '!', '**', '*']
EXAMPLES:
assert split_and_keep_delimiter('-xx-xx-','xx') == ['-', 'xx', '-', 'xx', '-'] # length 5
assert split_and_keep_delimiter('xx-xx-' ,'xx') == ['xx', '-', 'xx', '-'] # length 4
assert split_and_keep_delimiter('-xx-xx' ,'xx') == ['-', 'xx', '-', 'xx'] # length 4
assert split_and_keep_delimiter('xx-xx' ,'xx') == ['xx', '-', 'xx'] # length 3
assert split_and_keep_delimiter('xxxx' ,'xx') == ['xx', 'xx'] # length 2
assert split_and_keep_delimiter('xxx' ,'xx') == ['xx', 'x'] # length 2
assert split_and_keep_delimiter('x' ,'xx') == ['x'] # length 1
assert split_and_keep_delimiter('' ,'xx') == [] # length 0
assert split_and_keep_delimiter('aaa' ,'xx') == ['aaa'] # length 1
assert split_and_keep_delimiter('aa' ,'xx') == ['aa'] # length 1
assert split_and_keep_delimiter('a' ,'xx') == ['a'] # length 1
assert split_and_keep_delimiter('' ,'' ) == [] # length 0
assert split_and_keep_delimiter('a' ,'' ) == ['a'] # length 1
assert split_and_keep_delimiter('aa' ,'' ) == ['a', '', 'a'] # length 3
assert split_and_keep_delimiter('aaa' ,'' ) == ['a', '', 'a', '', 'a'] # length 5
"""
# Input assertions
assert isinstance(input,str), "input must be a string"
assert isinstance(delimiter,str), "delimiter must be a string"
if delimiter:
# These tokens do not include the delimiter, but are computed quickly
tokens = input.split(delimiter)
else:
# Edge case: if the delimiter is the empty string, split between the characters
tokens = list(input)
# The following assertions are always true for any string input and delimiter
# For speed's sake, we disable this assertion
# assert delimiter.join(tokens) == input
output = tokens[:1]
for token in tokens[1:]:
output.append(delimiter)
if token:
output.append(token)
# Don't let the first element be an empty string
if output[:1]==['']:
del output[0]
# The only case where we should have an empty string in the output is if it is our delimiter
# For speed's sake, we disable this assertion
# assert delimiter=='' or '' not in output
# The resulting strings should be combinable back into the original string
# For speed's sake, we disable this assertion
# assert ''.join(output) == input
return output
>>> line = 'hello_toto_is_there'
>>> sep = '_'
>>> [sep + x[1] if x[0] != 0 else x[1] for x in enumerate(line.split(sep))]
['hello', '_toto', '_is', '_there']

Categories