Apparently this problem comes up fairly often, after reading
Regular expression to detect semi-colon terminated C++ for & while loops
and thinking about the problem for a while, i wrote a function to return the content contained inside an arbitrary number of nested ()
The function could easily be extended to any regular expression object, posting here for your thoughts and considerations.
any refactoring advice would be appreciated
(note, i'm new to python still, and didn't feel like figuring out how to raise exceptions or whatever, so i just had the function return 'fail' if it couldin't figure out what was going on)
Edited function to take into account comments:
def ParseNestedParen(string, level):
"""
Return string contained in nested (), indexing i = level
"""
CountLeft = len(re.findall("\(", string))
CountRight = len(re.findall("\)", string))
if CountLeft == CountRight:
LeftRightIndex = [x for x in zip(
[Left.start()+1 for Left in re.finditer('\(', string)],
reversed([Right.start() for Right in re.finditer('\)', string)]))]
elif CountLeft > CountRight:
return ParseNestedParen(string + ')', level)
elif CountLeft < CountRight:
return ParseNestedParen('(' + string, level)
return string[LeftRightIndex[level][0]:LeftRightIndex[level][1]]
You don't make it clear exactly what the specification of your function is, but this behaviour seems wrong to me:
>>> ParseNestedParen('(a)(b)(c)', 0)
['a)(b)(c']
>>> nested_paren.ParseNestedParen('(a)(b)(c)', 1)
['b']
>>> nested_paren.ParseNestedParen('(a)(b)(c)', 2)
['']
Other comments on your code:
Docstring says "generate", but function returns a list, not a generator.
Since only one string is ever returned, why return it in a list?
Under what circumstances can the function return the string fail?
Repeatedly calling re.findall and then throwing away the result is wasteful.
You attempt to rebalance the parentheses in the string, but you do so only one parenthesis at a time:
>>> ParseNestedParen(')' * 1000, 1)
RuntimeError: maximum recursion depth exceeded while calling a Python object
As Thomi said in the question you linked to, "regular expressions really are the wrong tool for the job!"
The usual way to parse nested expressions is to use a stack, along these lines:
def parenthetic_contents(string):
"""Generate parenthesized contents in string as pairs (level, contents)."""
stack = []
for i, c in enumerate(string):
if c == '(':
stack.append(i)
elif c == ')' and stack:
start = stack.pop()
yield (len(stack), string[start + 1: i])
>>> list(parenthetic_contents('(a(b(c)(d)e)(f)g)'))
[(2, 'c'), (2, 'd'), (1, 'b(c)(d)e'), (1, 'f'), (0, 'a(b(c)(d)e)(f)g')]
Parentheses matching requires a parser with a push-down automaton. Some libraries exist, but the rules are simple enough that we can write it from scratch:
def push(obj, l, depth):
while depth:
l = l[-1]
depth -= 1
l.append(obj)
def parse_parentheses(s):
groups = []
depth = 0
try:
for char in s:
if char == '(':
push([], groups, depth)
depth += 1
elif char == ')':
depth -= 1
else:
push(char, groups, depth)
except IndexError:
raise ValueError('Parentheses mismatch')
if depth > 0:
raise ValueError('Parentheses mismatch')
else:
return groups
print(parse_parentheses('a(b(cd)f)')) # ['a', ['b', ['c', 'd'], 'f']]
Below is my Python solution with a time complexity of O(N)
str1 = "(a(b(c)d)(e(f)g)hi)"
def content_by_level(str1, l):
level_dict = {}
level = 0
level_char = ''
for s in str1:
if s == '(':
if level not in level_dict:
level_dict[level] = [level_char]
elif level_char != '':
level_dict[level].append(level_char)
level_char = ''
level += 1
elif s == ')':
if level not in level_dict:
level_dict[level] = [level_char]
elif level_char != '':
level_dict[level].append(level_char)
level_char = ''
level -= 1
else:
level_char += s
print(level_dict) # {0: [''], 1: ['a', 'hi'], 2: ['b', 'd', 'e', 'g'], 3: ['c', 'f']}
return level_dict[l]
print(content_by_level(str1,0)) # ['']
print(content_by_level(str1,1)) # ['a', 'hi']
print(content_by_level(str1,2)) # ['b', 'd', 'e', 'g']
print(content_by_level(str1,3)) # ['c', 'f']
#!/usr/bin/env python
import re
def ParseNestedParen(string, level):
"""
Generate strings contained in nested (), indexing i = level
"""
if len(re.findall("\(", string)) == len(re.findall("\)", string)):
LeftRightIndex = [x for x in zip(
[Left.start()+1 for Left in re.finditer('\(', string)],
reversed([Right.start() for Right in re.finditer('\)', string)]))]
elif len(re.findall("\(", string)) > len(re.findall("\)", string)):
return ParseNestedParen(string + ')', level)
elif len(re.findall("\(", string)) < len(re.findall("\)", string)):
return ParseNestedParen('(' + string, level)
else:
return 'fail'
return [string[LeftRightIndex[level][0]:LeftRightIndex[level][1]]]
Tests:
if __name__ == '__main__':
teststring = "outer(first(second(third)second)first)outer"
print(ParseNestedParen(teststring, 0))
print(ParseNestedParen(teststring, 1))
print(ParseNestedParen(teststring, 2))
teststring_2 = "outer(first(second(third)second)"
print(ParseNestedParen(teststring_2, 0))
print(ParseNestedParen(teststring_2, 1))
print(ParseNestedParen(teststring_2, 2))
teststring_3 = "second(third)second)first)outer"
print(ParseNestedParen(teststring_3, 0))
print(ParseNestedParen(teststring_3, 1))
print(ParseNestedParen(teststring_3, 2))
output:
Running tool: python3.1
['first(second(third)second)first']
['second(third)second']
['third']
['first(second(third)second)']
['second(third)second']
['third']
['(second(third)second)first']
['second(third)second']
['third']
>>>
Related
I’m super new to coding and solving an assignment. The problem at hand is to convert an input string of letters and square brackets into a list of letters and lists. For example:the string ‘[abc]’ should return [‘a’,‘b’,‘c’]. I have the code running for a few test cases but it fails for the following string: '[a[b[c]]]' which should return ['a',['b',['c']]], but instead my code returns [ 'a', [ 'b', [ 'c' ] ], [ 'c' ] ]. Can someone please tell me what I’m missing in my logic. Please help.
Here's my logic so far that I've been able to research.
def str2list(s):
out = []
for x in s.split('['):
if ']' in x[:-1]:
x1 = x.split(']')
x1 = [list(x1[0])]+list(x1[1])
elif x[-1:] == ']':
x1 = list(x.strip(']'))
else:
x1 = list(x)
out.extend(x1)
return out
For an arbitrary nests of brackets, it's better to use a recursive function:
def str2list(s):
def inner(item):
while True:
next_item = next(item)
if next_item == '[':
yield [x for x in inner(item)]
elif next_item == ']':
return
else:
yield next_item
return list(next(inner(iter(s))))
str2list('[a[b[c]]]')
Output:
['a', ['b', ['c']]]
When you split the string you remove all the opening brackets("[").
You could do something like this:
from ast import literal_eval
def str2list(s):
output = ""
for i in list(s):
if i != "[" and i != "]" and i != ",":
output += f"'{i}',"
else:
output += i
return literal_eval(output)
print(str2list("[a[b[c]]]"))
I am trying that age old question (there are multitudes of versions around) of finding the longest substring of a string which doesn't contain repeated characters. I can't work out why my attempt doesn't work properly:
def findLongest(inputStr):
resultSet = []
substr = []
for c in inputStr:
print ("c: ", c)
if substr == []:
substr.append([c])
continue
print(substr)
for str in substr:
print ("c: ",c," - str: ",str,"\n")
if c in str:
resultSet.append(str)
substr.remove(str)
else:
str.append(c)
substr.append([c])
print("Result set:")
print(resultSet)
return max(resultSet, key=len)
print (findLongest("pwwkewambb"))
When my output gets to the second 'w', it doesn't iterate over all the substr elements. I think I've done something silly, but I can't see what it is so some guidance would be appreciated! I feel like I'm going to kick myself at the answer...
The beginning of my output:
c: p
c: w
[['p']]
c: w - str: ['p']
c: w
[['p', 'w'], ['w']]
c: w - str: ['p', 'w'] # I expect the next line to say c: w - str: ['w']
c: k
[['w'], ['w']] # it is like the w was ignored as it is here
c: k - str: ['w']
c: k - str: ['w']
...
EDIT:
I replaced the for loop with
for idx, str in enumerate(substr):
print ("c: ",c," - str: ",str,"\n")
if c in str:
resultSet.append(str)
substr[idx] = []
else:
str.append(c)
and it produces the correct result. The only thing is that the empty element arrays get set with the next character. It seems a bit pointless; there must be a better way.
My expected output is kewamb.
e.g.
c: p
c: w
[['p']]
c: w - str: ['p']
c: w
[['p', 'w'], ['w']]
c: w - str: ['p', 'w']
c: w - str: ['w']
c: k
[[], [], ['w']]
c: k - str: []
c: k - str: []
c: k - str: ['w']
c: e
[['k'], ['k'], ['w', 'k'], ['k']]
c: e - str: ['k']
c: e - str: ['k']
c: e - str: ['w', 'k']
c: e - str: ['k']
...
Edit, per comment by #seymour on incorrect responses:
def find_longest(s):
_longest = set()
def longest(x):
if x in _longest:
_longest.clear()
return False
_longest.add(x)
return True
return ''.join(max((list(g) for _, g in groupby(s, key=longest)), key=len))
And test:
In [101]: assert find_longest('pwwkewambb') == 'kewamb'
In [102]: assert find_longest('abcabcbb') == 'abc'
In [103]: assert find_longest('abczxyabczxya') == 'abczxy'
Old answer:
from itertools import groupby
s = set() ## for mutable access
''.join(max((list(g) for _, g in groupby('pwwkewambb', key=lambda x: not ((s and x == s.pop()) or s.add(x)))), key=len))
'kewamb'
groupby returns an iterator grouped based on the function provided in the key argument, which by default is lambda x: x. Instead of the default we are utilizing some state by using a mutable structure (which could have been done a more intuitive way if using a normal function)
lambda x: not ((s and x == s.pop()) or s.add(x))
What is happening here is since I can't reassign a global assignment in a lambda (again I can do this, using a proper function), I just created a global mutable structure that I can add/remove. The key (no pun) is that I only keep elements that I need by using a short circuit to add/remove items as needed.
max and len are fairly self explanatory, to get the longest list produced by groupby
Another version without the mutable global structure business:
def longest(x):
if hasattr(longest, 'last'):
result = not (longest.last == x)
longest.last = x
return result
longest.last = x
return True
''.join(max((list(g) for _, g in groupby('pwwkewambb', key=longest)), key=len))
'kewamb'
Not sure what is wrong in your attempt, but it's complex and in:
for str in substr:
print ("c: ",c," - str: ",str,"\n")
if c in str:
resultSet.append(str)
substr.remove(str)
you're removing elements from a list while iterating on it: don't do that, it gives unexpected results.
Anyway, my solution, not sure it's intuitive, but it's probably simpler & shorter:
slice the string with an increasing index
for each slice, create a set and store letters until you reach the end of the string or a letter is already in the set. Your index is the max length
compute the max of this length for every iteration & store the corresponding string
Code:
def findLongest(s):
maxlen = 0
longest = ""
for i in range(0,len(s)):
subs = s[i:]
chars = set()
for j,c in enumerate(subs):
if c in chars:
break
else:
chars.add(c)
else:
# add 1 when end of string is reached (no break)
# handles the case where the longest string is at the end
j+=1
if j>maxlen:
maxlen=j
longest=s[i:i+j]
return longest
print(findLongest("pwwkewambb"))
result:
kewamb
Depends on your definition of repeated characters: if you mean consecutive, then the approved solution is slick, but not of characters appearing more than once (e.g.: pwwkewabmb -> 'kewabmb' ).
Here's what I came up with (Python 2):
def longest(word):
begin = 0
end = 0
longest = (0,0)
for i in xrange(len(word)):
try:
j = word.index(word[i],begin,end)
# longest?
if end-begin >= longest[1]-longest[0]:
longest = (begin,end)
begin = j+1
if begin==end:
end += 1
except:
end = i+1
end=i+1
if end-begin >= longest[1]-longest[0]:
longest = (begin,end)
return word[slice(*longest)]
Thus
>>> print longest('pwwkewabmb')
kewabm
>>> print longest('pwwkewambb')
kewamb
>>> print longest('bbbb')
b
My 2-cents:
from collections import Counter
def longest_unique_substr(s: str) -> str:
# get all substr-ings from s, starting with the longest one
for substr_len in range(len(s), 0, -1):
for substr_start_index in range(0, len(s) - substr_len + 1):
substr = s[substr_start_index : substr_start_index + substr_len]
# check if all substr characters are unique
c = Counter(substr)
if all(v == 1 for v in c.values()):
return substr
# ensure empty string input returns ""
return ""
Run:
In : longest_unique_substr('pwwkewambb')
Out: 'kewamb'
s=input()
ma=0
n=len(s)
l=[]
a=[]
d={}
st=0;i=0
while i<n:
if s[i] not in d:
d[s[i]]=i
l.append(s[i])
else:
t=d[s[i]]
d[s[i]]=i
s=s[t+1:]
d={}
n=len(s)
if len(l)>=3:
a.append(l)
ma=max(ma,len(l))
l=[];i=-1
i=i+1
if len(l)!=0 and len(l)>=3:
a.append(l)
ma=max(ma,len(l))
if len(a)==0:
print("-1")
else:
for i in a:
if len(i)==ma:
for j in i:
print(j,end="")
break
I have a list of strings like this:
lst= ['(', 'A', '(', 'B', '(', 'C', 'D', ')', '(', 'E', 'F', ')', ')', '(', 'G', 'H', ')', ')']
Joined together it looks like this:
(A(B(CD)(EF))(GH))
I want to traverse the list by element and store values into two lists like this: ['A','B','G'] ['B', 'C', 'E']
I was trying to do this:
l1=[]
for i in range(len(lst)):
if lst[i] == '(':
l1.append(lst[i+1])
How can I break the computation such that it counts the number of opening and closing paranthesis and when the opening brackets get its closing bracket, then it adds the element after the next opening bracket, to get the result: ['A','B','G'] ['B', 'C', 'E'] ?
import re
string = ''.join(lst)
results = []
for match in re.finditer('\w\(', string):
parens = 0
substring = string[match.start():]
results.append([substring[0]])
for ii, ch in enumerate(substring):
if ch == '(':
parens += 1
if parens == 1:
results[-1].append(substring[ii+1])
elif ch == ')':
parens -= 1
if parens < 0:
break
Or without regex:
results = []
for jj in range(len(lst) - 1):
if lst[jj] != ')' and lst[jj+1] == '(':
parens = 0
results.append([lst[jj]])
substring = lst[jj:]
for ii, ch in enumerate(substring):
if ch == '(':
parens += 1
if parens == 1:
results[-1].append(substring[ii+1])
elif ch == ')':
parens -= 1
if parens < 0:
break
If I understand you correctly, the string represents a tree structure, with each node having a name and an arbitrary number of children. In s-expression style, this name is the first entry in a list (if there is a list; leaf nodes are only named).
(A
(B
(CD)
(EF))
(GH))
Within this tree, you want a printout of the nodes with multiple branches, including the names of those branches but not their contents. Parsing them is indeed easiest with a stack, possibly implicit in recursion.
from pprint import pprint
instr="(A(B(CD)(EF))(GH))"
nodes=[]
curnode=nodes
nodestack=[]
for char in instr:
if char == '(':
nodestack.append(curnode)
curnode.append([])
curnode=curnode[-1]
elif char == ')':
curnode=nodestack.pop()
else:
curnode.append(char)
# Show that the tree is now converted into list form
pprint(nodes, width=20)
def name(node):
return node[0]
def branches(tree):
if len(tree)>=3:
yield map(name,tree) # exploits that names are only 1 char
for branch in tree[1:]:
# search deeper also
for subbranch in branches(branch):
yield subbranch
for root in nodes:
pprint(list(branches(root)))
It's certainly possible to merge the operations by printing branching nodes as they finish parsing (in the ')' case).
I want to remove a character from a string in permutation....
Let us say that I have a function
def (string,char):
# remove char from string
Say I have aAabbAA as string and A as char then I want the strings [aabb,aAabb,aabbA,aabbA, aabbAA,aAabbA ,aAabbA ] as output that is A gets removed 3 times , 2 times , 1 times.
What is the best way in which I can do that ??
Thanks a lot....
Here is one crazy idea using recursion:
def f(s, c, start):
i = s.find(c, start)
if i < 0:
return [s]
else:
return f(s, c, i+1) + f(s[:i]+s[i+1:], c, i)
s = 'aAabbAA'
print f(s, 'A', 0)
# ['aAabbAA', 'aAabbA', 'aAabbA', 'aAabb', 'aabbAA', 'aabbA', 'aabbA', 'aabb']
Edit: Using set:
def f(s, c, start):
i = s.find(c, start)
if i < 0:
return set([s])
else:
return set.union(f(s, c, i+1), f(s[:i]+s[i+1:], c, i))
s = 'aAabbAA'
print f(s, 'A', 0)
# set(['aAabbA', 'aabbAA', 'aAabbAA', 'aabb', 'aAabb', 'aabbA'])
Edit 2: Using ternary operator:
def f(s, c, start):
i = s.find(c, start)
return [s] if i < 0 else f(s, c, i+1) + f(s[:i]+s[i+1:], c, i)
s = 'aAabbAA'
print f(s, 'A', 0)
# ['aAabbAA', 'aAabbA', 'aAabbA', 'aAabb', 'aabbAA', 'aabbA', 'aabbA', 'aabb']
Edit 3: timeit:
In [32]: timeit.timeit('x = f("aAabbAA", "A", 0)',
'from test3 import f', number=10000)
Out[32]: 0.11674594879150391
In [33]: timeit.timeit('x = deperm("aAabbAA", "A")',
'from test4 import deperm', number=10000)
Out[33]: 0.35839986801147461
In [34]: timeit.timeit('x = f("aAabbAA"*6, "A", 0)',
'from test3 import f', number=1)
Out[34]: 0.45998811721801758
In [35]: timeit.timeit('x = deperm("aAabbAA"*6, "A")',
'from test4 import deperm', number=1)
Out[35]: 7.8437530994415283
Here's a solution that might work. Basically I use a product of all possible combinations of the target character and an empty string.
from itertools import product
def deperm(st, c):
rsts = []
indexes = [i for i, s in enumerate(st) if s == c]
for i in product([c, ''], repeat=len(indexes)):
newst = ''
for j, ch in enumerate(st):
if j in indexes:
newst += i[indexes.index(j)]
else:
newst += ch
rsts.append(newst)
return rsts
for i in deperm('aAabbAA', 'A'):
print i
This outputs:
aAabbAA
aAabbA
aAabbA
aAabb
aabbAA
aabbA
aabbA
aabb
A recursive algorithm like so might help you here. Sorry I'm not a python champ, so you might have to tweak the syntax yourself. Psuedo code:
// returns a set of strings (permutations)
def permutation(string, char)
if len(string) == 0
return [] // return empty set
// get the set of permutations of suffix string recursively
set_of_perm_suffix = permutation(string[1:], char)
// prepend char to every string in set_of_perm
appended_set = prepend_char(set_of_perm_suffix , string[0])
// if the first char matches the one we should remove, we could either
// remove it or keep it.
if (string[0] == char)
return union_of_sets(set_of_perm_suffix , appended_set)
else
// the first char doesn't match the one we should remove,
// we need to keep it in every string of the set
return appended_set
I'd like to create a slice object from a string; right now the only way seems through a cumbersome hacky eval statement
class getslice:
def __getitem__(self, idx): return idx[0]
eval("getslice()[%s, 1]" %(":-1"))
thanks in advance.
Edit: Sorry if the original prompt was not clear, the input in this case was ":-1". The point was to parse the string. Ignacio Vazquez-Abrams's response at least solved the problem (and seems to work with reverse indexing as well), but I think my solution above is still more clear if not conceptually clean (and will work correctly if Python ever changes slicing syntax).
slice(*map(lambda x: int(x.strip()) if x.strip() else None, mystring.split(':')))
for single arg slices '-1' or '1' so when mystring.split(':')==1 you just call int(x)
On request, took it out of comment section.
If you want a slice object, why don't you just instantiate one?
s = slice(start, stop, step)
What are you meaning by "creating it from a string"?
slice(*[{True: lambda n: None, False: int}[x == ''](x) for x in (mystring.split(':') + ['', '', ''])[:3]])
I end up here because I wanted my script to accept a python-like splice argument and render it into a list of integers, I did it with a function that seems like it answers the OP's question:
# create a slice object from a string
def get_slice_obj(slicearg):
slice_ints = tuple([ int(n) for n in slicearg.split(':') ])
return apply(slice, slice_ints)
def ints_from_slicearg(slicearg):
slice_obj = get_slice_obj(slicearg)
return(range(slice_obj.start or 0, slice_obj.stop or -1, slice_obj.step or 1))
for t in ['1', '1:3', '4:9:2']:
print t, "=>", ints_from_slicearg(t)
Output:
1 => [0]
1:3 => [1, 2]
4:9:2 => [4, 6, 8]
Here's another method (just a consolidation of the others posted here):
def make_slice(expr):
def to_piece(s):
return s and int(s) or None
pieces = map(to_piece, expr.split(':'))
if len(pieces) == 1:
return slice(pieces[0], pieces[0] + 1)
else:
return slice(*pieces)
Example usages:
In [1]: make_slice(':')
Out[1]: slice(None, None, None)
In [2]: make_slice(':-2')
Out[2]: slice(None, -2, None)
In [3]: x = [1, 2, 3]
In [4]: x[make_slice('::-1')]
Out[4]: [3, 2, 1]
The one-liner from Ignacio Vazquez-Abrams is short but hardly readable and handles a single number inconsistently with slice. This tries to parse it in a cleaner way.
def parse_slice(value):
"""
Parses a `slice()` from string, like `start:stop:step`.
"""
if value:
parts = value.split(':')
if len(parts) == 1:
# slice(stop)
parts = [None, parts[0]]
# else: slice(start, stop[, step])
else:
# slice()
parts = []
return slice(*[int(p) if p else None for p in parts])
# unit tests:
try:
assert parse_slice('')
assert False, 'It should raise TypeError'
except TypeError:
pass
assert parse_slice('2') == slice(2)
assert parse_slice('2:3') == slice(2, 3)
assert parse_slice(':3') == slice(None, 3)
assert parse_slice(':') == slice(None, None)
assert parse_slice('2:') == slice(2, None)
assert parse_slice('2:3:4') == slice(2, 3, 4)
assert parse_slice(':3:4') == slice(None, 3, 4)
assert parse_slice('2::4') == slice(2, None, 4)
assert parse_slice('2:3:') == slice(2, 3, None)
assert parse_slice('::4') == slice(None, None, 4)
assert parse_slice('2::') == slice(2, None, None)
assert parse_slice('::') == slice(None, None, None)
assert parse_slice('-12:-13:-14') == slice(-12, -13, -14)
assert parse_slice('2:3:-4') == slice(2, 3, -4)
try:
parse_slice('1:2:3:4')
assert False, 'It should raise TypeError'
except TypeError:
pass
Based on #pprzemak drafted the following function for elaborate parsing:
def parse_slice(v: Text):
"""
Parses text like python "slice" expression (ie ``-10::2``).
:param v:
the slice expression or a lone integer
:return:
- None if input is None/empty
- a ``slice()`` instance (even if input a lone numbrt)
:raise ValueError:
input non-empty but invalid syntax
"""
orig_v = v
v = v and v.strip()
if not v:
return
try:
if ':' not in v:
## A lone number given.
v = int(v)
return slice(v, v + 1)
return slice(*map(lambda x: int(x.strip()) if x.strip() else None,
v.split(':')))
except Exception:
pass
## An alternative is to return `slice(None)` here.
raise trt.TraitError("Syntax-error in '%s' slice!" % orig_v)
How 'bout this (for simple non empty slice intervals) :
sliceStr = "3:8"
mySlice = slice( *map(int, sliceStr.split(':') ) )
I just needed to do this 12 years later so here's my answer using regex :)
import re
def parse_slice(string: str) -> slice:
"""
Parse a string representation of a slice and return a slice object
"""
# Matches one required colon, one optional colon, and up to three
# positive or negative numbers between them
match = re.match(r"^(-?[\d]*):(-?[\d]*)[:]?(-?[\d]*)$", string)
if match:
args = tuple(map(lambda s: int(s) if s else None, match.group(1, 2, 3)))
return slice(*args)
raise ValueError("Could not parse slice")
A slice object is usually created using subscript notation, this notation uses slice() internally, as stated on the slice() documentation. What you want to do is:
your_string[start:end]
From the python tutorial:
Strings can be subscripted (indexed);
like in C, the first character of a
string has subscript (index) 0. There
is no separate character type; a
character is simply a string of size
one. Like in Icon, substrings can be
specified with the slice notation: two
indices separated by a colon.
>>> word = 'Help' + 'A'
>>> word[4]
'A'
>>> word[0:2]
'He'
>>> word[2:4]
'lp'
Slice indices have useful defaults; an
omitted first index defaults to zero,
an omitted second index defaults to
the size of the string being sliced.
>>> word[:2] # The first two characters
'He'
>>> word[2:] # Everything except the first two characters
'lpA'
My solution to parse numpy style advanced indexing from string: my gist.
Although this is an old post, it's the only one I can find on this topic. Hope it helps.
Upon suggestion, I paste the code here, which could be a little bit long ... The code usage is (assuming a is an array-like object): a[parse_slice('1')] gives a[1]; a[parse_slice('2:,-1')] gives a[2:,-1]; etc.
import re
SLICE_TEMPLATES = [
('s', r'(?P<i>[+-]?\d+)'),
('sp', r'\((?P<i>[+-]?\d+)\)'),
('a', r'::?'),
('ri-', r'(?P<i>[+-]?\d+)::?'),
('ri-k', r'(?P<i>[+-]?\d+)::(?P<k>[+-]?\d+)'),
('r-j', r':(?P<j>[+-]?\d+):?'),
('r-jk', r':(?P<j>[+-]?\d+):(?P<k>[+-]?\d+)'),
('rij', r'(?P<i>[+-]?\d+):(?P<j>[+-]?\d+):?'),
('rijk', r'(?P<i>[+-]?\d+):(?P<j>[+-]?\d+):(?P<k>[+-]?\d+)'),
('r--k', r'::(?P<k>[+-]?\d+)'),
('l', r'\.\.\.'),
('eb', r'\[(?P<e>[+-]?\d+(,[+-]?\d+)*,?)\]'),
('ep', r'\((?P<e>[+-]?\d+(,[+-]?\d+)+,?)\)'),
('ep1', r'\((?P<e>[+-]?\d+,)\)'),
]
SLICE_TEMPLATES = [(k, re.compile(v)) for k, v in SLICE_TEMPLATES]
def tokenize_slice_groups(string):
# tokenize groups
groups = []
sbuf = []
expecting = {'(': ')', '[': ']'}
pbbuf = []
LEGAL_CHARS = '0123456789()[]+-:.'
WHITESPACE_CHARS = ' \t'
for c in string:
if c in WHITESPACE_CHARS:
pass
elif c == ',':
if len(pbbuf) not in (0, 2):
sbuf.append(c)
else:
groups.append(''.join(sbuf))
sbuf.clear()
pbbuf.clear()
elif c in LEGAL_CHARS:
sbuf.append(c)
if c in '([':
if pbbuf:
raise ValueError('too many brackets in axis {}'.format(
len(groups)))
pbbuf.append(c)
elif c in ')]':
if not pbbuf:
raise ValueError('brackets not match in axis {}'.format(
len(groups)))
if c != expecting[pbbuf[0]]:
raise ValueError('brackets not match in axis {}'.format(
len(groups)))
pbbuf.append(c)
else:
raise ValueError('illegal char `{}\''.format(c))
groups.append(''.join(sbuf))
return groups
def parse_slice_group(string):
for name, tem in SLICE_TEMPLATES:
matched = tem.fullmatch(string)
if matched:
if name[0] == 's':
return int(matched.group('i'))
if name[0] == 'a':
return slice(None, None, None)
if name[0] == 'r':
i, j, k = None, None, None
if 'i' in name:
i = int(matched.group('i'))
if 'j' in name:
j = int(matched.group('j'))
if 'k' in name:
k = int(matched.group('k'))
return slice(i, j, k)
if name[0] == 'l':
return ...
# if name[0] == 'e'
return list(map(int, filter(None, matched.group('e').split(','))))
raise ValueError('illegal group "{}"'.format(string))
def parse_slice(string):
groups = tokenize_slice_groups(string)
if groups == ['']:
raise ValueError('index must not be empty')
if groups and groups[-1] == '':
del groups[-1]
index = tuple(map(parse_slice_group, groups))
if index.count(...) > 1:
raise ValueError('ellipsis may occur at most once')
return index