Splitting C string in Python - python

I would like to split a string similar to
'abc "defg hijk \\"l; mn\\" opqrs"; tuv'
into
(['abc', '"defg hijk \\"l; mn\\" opqrs"'], 33)
i.e. I don't want to break on semicolon inside (nested) quotes. What's the easiest way, tokenize? It doesn't hurt if it's fast, but short is better.
Edit: I forgot one more detail that makes it even more tricky. I need the position of the semicolon that is cutting off the string, or -1 if there is none. (I'm doing changes to legacy code that used to be recursive, but stackoverflowed when the string became very long.)

It's unlikely there is an easy way to solve this without a proper parser. You could probably get away with a hand built parser that doesn't require tokenizing though.
Something like the following should be a good guide:
def parse(s):
cur_s = []
strings = []
def flush_string():
strings.push(''.join(cur_s))
cur_s = []
def handle_special_cases():
# TODO: Fill this in
for c in s:
if c == ';':
break
elif c in ['\\' '"']:
handle_special_cases()
elif c == ' ':
flush_string()
else:
cur_s.push(c)
flush_string()
return strings

It's a stateful search, so simple stateless operations are not available. Here's a simple char-by-char stateful evaluator that might meet your "short" without resorting to full tokenization/parsing:
#!/usr/bin/env python
inp="""abc "defg hijk \\"l; mn\\" opqrs"; tuv'`"""
def words_to_semi(inpstr):
ret = ['']
st8 = 1 # state: 1=reg, 2=in quotes, 3=escaped quote, 4=escaped reg, 0=end
ops = { 1 : {' ': lambda c: (None,1),
'"': lambda c: (c,2),
';': lambda c: ('',0),
'\\': lambda c: (c,4),
},
2 : {'\\': lambda c: (c,3),
'"': lambda c: (c,1),
},
3 : {None: lambda c: (c,2)},
4 : {None: lambda c: (c,1)},
}
pos = 0
for C in inpstr:
oc,st8 = ops[st8].get(C, ops[st8].get(None, lambda c:(c,st8)))(C)
if not st8: break
if oc is None:
ret.append('')
else:
ret[-1] += oc
pos = pos + 1
return ret, pos
print str(words_to_semi(inp))
Just modify the ops dict (and add new states) to handle other cases; everything else is generic code.

Here's the brute-force method I went with. Brrr...
def f(s):
instr = False
inescape = False
a = ''
rs = []
cut_index = -1
for idx,ch in enumerate(s):
if instr:
a += ch
if inescape:
inescape = False
elif ch == '\\':
inescape = True
elif ch == '"':
if a:
rs += [a]
a = ''
instr = False
elif ch == '"':
if a:
rs += [a]
a = ch
instr = True
elif ch == ';':
if a:
rs += [a]
cut_index = idx
break
elif ch == ' ' or ch == '\t' or ch == '\n':
if a:
rs += [a]
a = ''
else:
a += ch
return rs, cut_index
f('abc "defg hijk \\"l; mn\\" opqrs"; tuv')

Related

Remove every special character from start of string and store it in a variable

I am trying to build a Pig Latin translator. I want to remove all the characters on the start of a string and store them in a variable. I have already done it to the end, but for some reason when I try and do it the the start it messes up the word.
For example, when I input *test* it returns *stteay* when it should return *esttay*. Also if you put in test* it works fine. I have no clue why.
The funny thing is, that if I only put a special character to the end of the string it works fine! The two pieces of code for extracting the special chars are pretty much identical as well.
Here is the code:
def special_chars_check(text: str) -> bool:
return any(c for c in text if not c.isalnum() and not c.isspace())
def convert(stuff):
words = stuff.split()
output = ''
for i in words:
post_special_chars = ''
reversed_temp = i[::-1]
for c in reversed_temp:
if special_chars_check(reversed_temp[0]):
if special_chars_check(c):
post_special_chars = f'{post_special_chars}{reversed_temp[0]}'
reversed_temp = reversed_temp[1:]
i_temp = reversed_temp[::-1]
else:
break
if len(post_special_chars) == 0:
i_temp = i
pre_special_chars = ''
normal_temp = i_temp
for c in normal_temp:
if special_chars_check(normal_temp[0]):
if special_chars_check(c):
pre_special_chars = f'{pre_special_chars}{normal_temp[0]}'
normal_temp = normal_temp[1:]
i_temp = normal_temp
else:
break
if len(pre_special_chars) == 0:
i_temp = i
allcaps = False
firstcaps = False
if i.isupper():
allcaps = True
elif i[0].isupper():
firstcaps = True
i_temp2 = i_temp
vowel_in = False
for c in i:
if c in 'aeiouAEIOU':
vowel_in = True
if i[0] in 'aeiouAEIOU' and vowel_in:
if allcaps:
output = f'{output} {pre_special_chars}{i_temp.upper()}WAY{post_special_chars}'
elif firstcaps:
temp_output = i.lower()
temp_char2 = temp_output[0]
temp_output = temp_output[1:]
final_i_temp = f'{temp_char2.upper()}{temp_output}'
output = f'{output} {pre_special_chars}{final_i_temp}way{post_special_chars}'
else:
output = f'{output} {pre_special_chars}{i_temp}way{post_special_chars}'
elif i[0] not in 'aeiouAEIOU' and vowel_in:
fulfilled = False
for c in i:
if not fulfilled:
if c in 'aeiouyAEIOUY':
if allcaps:
output = f'{output} {pre_special_chars}{i_temp.upper()}AY{post_special_chars}'
elif firstcaps:
temp_output = i_temp.lower()
temp_char2 = temp_output[0]
temp_output = temp_output[1:]
final_i_temp = f'{temp_char2.upper()}{temp_output}'
output = f'{output} {pre_special_chars}{final_i_temp}ay{post_special_chars}'
else:
output = f'{output} {pre_special_chars}{i_temp}ay{post_special_chars}'
fulfilled = True
else:
temp_char = i_temp2[0]
i_temp = f'{i_temp2[1:]}{temp_char}'
i_temp2 = i_temp
fulfilled = False
else:
output = f'{output} {i}'
try:
output[1:]
except:
return output
else:
return output[1:]
while True:
print(convert(input('Enter text: ')))
It would really help if someone could point out my mistake.
Thankyou!
I just found the answer... It was a silly little mistake. On line 63 for c in i: needed to be for c in i_temp: Thankyou to #JustLearning for the advice.

printing out or returning strings between brackets

I'm trying to write a function in similar manner as started, so that I will get what it's doing. I'm assuming this can be done with one line of code, with some fancy functions, but for the sake of practice and understanding I'm trying to come up with similar solution.
The task is the following: the function takes a text once it encounters enclosed square brackets [ word ] It should print out or return all words which are between square brackets. For example, if the text string would be "[a]n example[ string]", you are expected to print out "a string".
def string():
text = "some random text [and I need this bit of txt] but I don't know how to continue [to get this bit as well]"
for i in text:
for j in range(len(text)):
if text[j] == '[':
new = text.find(']')
return(text[j+1:new])
print(string())
Try this:
def extract(text, skip_chars=("\n", )):
output = ""
flag = False
for c in text:
if c == "]":
flag = False
if flag and not c in skip_chars:
output += c
if c == "[":
flag = True
return output
print(extract("""[a]n example[
stri
ng]"""))
# -> "a string"
def string():
result = []
text = "some random text [and I need this bit of txt] but I don't know how to continue [to get this bit as well]"
for i in text:
if i == '[':
new = text.find(']')
result.append(text[text.index(i) + 1:new])
return " ".join(result)
print(string())
def parse(source):
i = source.index("[") # throw an exception
result = ""
while i < len(source):
if s[i] == "[":
i += 1
while i < len(source):
temp = ""
if source[i] == "]":
result += temp
break;
temp += source[i]
i += 1
i += 1
return result

Find bracket which are not closed without using regex in python

I am trying to find out if any bracket is not closed without using regex, this is what I am trying but it is failed when the string are like "re(d))()(()"
def bracket(str):
return [0,1][str.count(')') == str.count('(')]
s = "re(d))()(()"
print bracket(s)
Is there any better way to do it.
Something like this?
def check_brackets(s):
counter = 0
for chr in s:
if chr == "(":
counter += 1
elif chr == ")":
counter -= 1
if counter < 0:
return False
return counter == 0
EDIT: Here's how you can do that with many different bracket types:
BRACKETS = ("()", "[]", "{}")
def check_brackets(s):
counter = []
for chr in s:
for br in BRACKETS:
open = br[0]
close = br[1]
if chr == open:
counter.append(open)
break
elif chr == close:
try:
last_br = counter.pop()
except IndexError:
return False
if last_br != open: # ensures that the end matches the beginnig
return False
return not bool(counter)
Note that it will mark ([)] as invalid (which is as it should be).
Using the approach from Shunting-yard algorithm:
from collections import deque
def solve(s):
queue = deque()
for c in s:
if c == ')':
if queue:
queue.pop()
continue
return False
elif c == '(':
queue.append(c)
return not bool(queue)
Demo:
>>> solve("re(d))()(()")
False
>>> solve("(re(d)()())")
True
>>> solve("(re(d)()((())))")
True
>>> solve(")))(((")
False
In [44]: from collections import deque
...: def solve(s):
...: queue = deque()
...: for c in s:
...: if c == ')':
...: if queue:
...: queue.pop()
...: else:
...: return False
...: elif c == '(':
...: queue.append(c)
...: return not bool(queue)
...:
In [45]: print solve('()'), solve('())'), solve('re(d))()(()')
True False False
it's important to check if the deque being empty when a ')' comes

regex to replace regex

I have this regex for getting strings in Python code:
x1 = re.compile('''((?P<unicode>u?)(?P<c1>'|")(?P<data>.+?)(?P<c2>'|"))''')
I want to extract the data and c1,c2 parts of this regex to make a replace string (if c1 == c2)
Something like:
repl = "u<c1><data><c2>"
How can I do this??
Is that possible in one line or by using re.sub?
UPDATE:
My new code:
x1 = re.compile('''(?P<unicode>u?)(?P<c>'|")(?P<data>.*?)(?P=c)''')
def repl(match):
if '#' in match.string:
### Confused
return "u%(c)s%(data)s%(c)s" % m.groupdict()
fcode = '\n'.join([re.sub(x1,repl,i) for i in scode.splitlines()])
Here, I am having problems to determine how to not change strings in comments, what do I have to do to ignore the comments??
Say you have a pattern:
pattern = r'''(?P<unicode>u?)(?P<c>'|")(?P<data>.*?)(?P=c)''' # did a little tweak
Match a string:
m = re.search(pattern, "print('hello')")
What you got:
>>> m.groups()
('', '"', 'hello')
>>> m.groupdict()
{'c': '"', 'unicode': '', 'data': 'hello'}
Now you can do whatever you want with these:
>>> 'u{c}{data}{c}'.format_map(m.groupdict())
'u"hello"'
Maybe you are using Python 2.x:
>>> 'u{c}{data}{c}'.format(**m.groupdict())
'u"hello"'
Or even you like old %
>>> "u%(c)s%(data)s%(c)s" % m.groupdict()
'u"hello"'
Edited:
The regex solution can't handle some situations correctly.
So I used a 2to3 hack(it's actually 3to2, and still can't solve everything):
cd /usr/lib/python3.3/lib2to3/fixes/
cp fix_unicode.py fix_unicode33.py
Edit fix_unicode33.py
-_literal_re = re.compile(r"[uU][rR]?[\'\"]")
+_literal_re = re.compile(r"[rR]?[\'\"]")
-class FixUnicode(fixer_base.BaseFix):
+class FixUnicode33(fixer_base.BaseFix):
- new.value = new.value[1:]
+ new.value = 'u' + new.value
Now 2to3 --list | grep unicode33 should output unicode33
Then you can run 2to3 -f unicode33 py3files.py.
Remember to remove fix_unicode33.py after
NOTE: In Python3 ur"string" throws SyntaxError. The logic here is simple, modify it to reach your goal.
The long code I ended up with.
x1 = re.compile('''(?P<unicode>u?)(?P<c>'|")(?P<data>.*?)(?P=c)''')
def in_string(text,index):
curr,in_l,in_str,level = '',0,False,[]
for c in text[:index+1]:
if c == '"' or c == "'":
if in_str and curr == c:
instr = False
curr = ''
in_l -= 1
else:
instr = True
curr = c
in_l += 1
level.append(in_l)
return bool(level[index])
def repl(m):
return "u%(c)s%(data)s%(c)s" % m.groupdict()
def handle_hashes(i):
if i.count('#') == 1:
n = i.find('#')
else:
n = get_hash_out_of_string(i)
return re.sub(x1,repl,i[:n]) + i[n:]
def get_hash_out_of_string(i):
n = i.find('#')
curr = i[:]
last = (len(i)-1)-''.join(list(reversed(i))).find('#')
while in_string(curr,n) and n < last:
curr = curr[:n]+' '+curr[n+1:]
n = curr.find('#')
return n

Parentheses pairing ({}[]()<>) issue

I want to be able to pair up all parentheses in a string, if they aren't paired then then they get their index number and False. It seems like it is repeating some values over and over, i.e cl == pop[1]. I have tried to see where the problem is but I can't see it no matter how hard I try. So I'm asking if anyone help me to locate the error and maybe even improve my code ;)
def check_parentheses(string):
pending = 0
brackets = []
'''Checks if parens are paired, otherwise they are bad.'''
parenstack = collections.deque()
for ch in string:
if ch in lrmap:
try:
cl = string.index(ch, pending)
pending = cl + 1
except:
cl = False
if ch in lparens:
parenstack.append([ch, cl])
print parenstack
elif ch in rparens:
try:
pop = parenstack.pop()
if lrmap[pop[0]] != ch:
print 'wrong type of parenthesis popped from stack',\
pop[0], ch, pop[1], cl
brackets.append([pop[1], False])
brackets.append([cl, False])
else:
brackets.append([pop[1], cl])
except IndexError:
print 'no opening parenthesis left in stack'
brackets.append([cl, False])
# if we are not out of opening parentheses, we have a mismatch
for p in parenstack:
brackets.append([p[1],False])
return brackets
You can adapt my code to a similar question:
def Evaluate(str):
stack = []
pushChars, popChars = "<({[", ">)}]"
for c in str :
if c in pushChars :
stack.append(c)
elif c in popChars :
if not len(stack) :
return False
else :
stackTop = stack.pop()
balancingBracket = pushChars[popChars.index(c)]
if stackTop != balancingBracket :
return False
else :
return False
return not len(stack)
iparens = iter('(){}[]<>')
parens = dict(zip(iparens, iparens))
closing = parens.values()
def balanced(astr):
stack = []
for c in astr:
d = parens.get(c, None)
if d:
stack.append(d)
elif c in closing:
if not stack or c != stack.pop():
return False
return not stack
Example:
>>> balanced('[1<2>(3)]')
True
>>> balanced('[1<2(>3)]')
False
BRACES = { '(': ')', '[': ']', '{': '}' }
def group_check(s):
stack = []
for b in s:
c = BRACES.get(b)
if c:
stack.append(c)
elif not stack or stack.pop() != b:
return False
return not stack
Thanks hughdbrown your code was a breeze to get working and it's really short! You've just saved me a headache :D
converted it to pep8 if thats ok :)
Edit
Added support for comments and strings, it will not match inside them.
Added support for easy language brace checking, modify the charset dict.
Correctly paires up, i.e right to left
HTML
charset = dict(opening='{[(<',\
closing='}])>',\
string = ('"', "'"),\
comment=(('<!--', '-->')))
Python
charset = dict(opening='{[(<',\
closing='}])>',\
string = ('"', "'"),\
comment=(("'''", "'''"), ('"""', '"""'), ('#', '\n')))
C++
charset = dict(opening='{[(<',\
closing='}])>',\
string = ('"', "'"),\
comment=(('/*', '*/'), ('//', '\n')))
you get the point? :)
charset = dict(opening='{[(<',\
closing='}])>',\
string = ('"', "'"),\
comment=(('<!--', '-->'), ('"""', '"""'), ('#', '\n')))
allowed = ''.join([x[0][0] + x[1][0] for x in charset['comment']])
allowed += ''.join(charset['string'])
allowed += charset['opening']
allowed += charset['closing']
def brace_check(text):
o = []
c = []
notr = []
found = []
busy = False
last_pos = None
for i in xrange(len(text)):
ch = text[i]
if not busy:
cont = True
for comment in charset['comment']:
if ch == comment[0][0]:
como = text[i:len(comment[0])]
if como == comment[0]:
busy = comment[1]
if ch in charset['opening']:
last_pos = i
cont = False
break
if cont:
if ch in charset['string']:
busy = ch
elif ch in charset['opening']:
o.append((ch, i))
elif ch in charset['closing']:
c.append((ch, i))
else:
if ch == busy[0]:
if len(busy) == 1:
comc = ch
else:
comc = text[i:i + len(busy)]
if comc == busy:
if last_pos is not None:
if busy[-1] in charset['closing']:
found.append((last_pos, i))
last_pos = None
text = text[:i] + '\n' * len(comc) +\
text[i + len(comc):]
busy = not busy
elif busy in charset['string']:
if ch == '\n':
busy = not busy
for t, e in reversed(o):
try:
n = next((b, v) for b, v in c\
if b == charset['closing'][\
charset['opening'].find(t)] and v > e)
c.remove(n)
n = n[1]
if found != []:
if e < found[-1][0] and n > found[-1][0] and n < found[-1][1]\
or e < found[-1][1] and n > found[-1][1] and e > found[-1][0]:
found.append((n, False))
n = False
except StopIteration:
n = False
found.append((e, n))
for t, e in c:
found.append((e, False))
return found
An understandable solution in Python 3:
def check_balanced_string(str):
stack = []
dicc = {'(': ')', '[': ']', '{': '}'}
for char in str:
if char in dicc.keys(): # opening char
stack.append(char)
elif char in dicc.values(): # closing char
if dicc[stack[-1]] == char: # check if closing char corresponds to last opening char
stack.pop()
else:
return False
return not len(stack) # returns True when len == 0
eq = '{1+[3*5+(2+1)]}'
print(check_balanced_string(eq))
Try this:
def matched(s):
stack=[]
open,close="(",")"
for i in s:
if i in open:
stack.append(i)
if i in close:
if len(stack)==0:
return(False)
else:
stack.pop()
if len(stack):
return(False)
else:
return(True)
The below code will display the missing parentheses and the no of times missing in the given string.
from collections import Counter
def find_missing(str):
stack1 = []
stack2 = []
result = []
res_dict = {}
open_set = '<[{('
closed_set = '>]})'
a = list(str)
for i in a:
if i in open_set:
stack1.append(i)
elif i in closed_set:
stack2.append(i)
dict1 = Counter(stack1)
dict2 = Counter(stack2)
print(dict1)
print(dict2)
for i in open_set:
if dict1[i] > dict2[closed_set[open_set.index(i)]]:
res_dict[closed_set[open_set.index(i)]] = dict1[i] - dict2[closed_set[open_set.index(i)]]
result.append(closed_set[open_set.index(i)])
for i in closed_set:
if dict2[i] > dict1[open_set[closed_set.index(i)]]:
res_dict[open_set[closed_set.index(i)]] = dict2[i] - dict1[open_set[closed_set.index(i)]]
result.append(open_set[closed_set.index(i)])
return res_dict
# return result
if __name__ == '__main__':
str1 = '{This ((()bracket {[function]} <<going> crazy}'
x = find_missing(str1)
if len(x) > 0:
print("Imbalanced")
print(x)
else:
print("Balanced")
First we will scan the string from left to right, and every time we see an opening parenthesis we push it to a stack, because we want the last opening parenthesis to be closed first. (Remember the FILO structure of a stack!)
Then, when we see a closing parenthesis we check whether the last opened one is the corresponding closing match, by popping an element from the stack. If it’s a valid match, then we proceed forward, if not return false.
Code:
https://gist.github.com/i143code/51962bfb1bd5925f75007d4dcbcf7f55
I needed something for a recent project and figured I could build on the OP's solution a bit. It allows for comment patterns, quotes and brackets to be checked, whilst ignoring the surrounding text. I've purposefully made it more generic than it needs to be so that others can take what they want and cut out what they don't.
"""
This module is for testing bracket pairings within a given string
Tested with Python 3.5.4
>>> regexp = getRegexFromList(opening + closing)
>>> print(regexp)
(\\<\\-\\-|\\-\\-\\>|\\/\\*|\\/\\/|\\*\\/|\\#|\\"|\\'|\\(|\\[|\\{|\\<|\\\n|\\\n|\\"|\\'|\\)|\\]|\\}|\\>)
>>> test_string = 'l<--([0])-->1/*{<2>}*/3//<--4 &-->\\n5#"6"\\n7"/*(8)*/"9\'"10"\'11({12\ta})13[<14>]'
>>> patterns = re.findall(regexp, test_string)
>>> print(patterns)
['<--', '(', '[', ']', ')', '-->', '/*', '{', '<', '>', '}', '*/', '//', '<--', '-->', '\\n', '#', '"', '"', '\\n', '"', '/*', '(', ')', '*/', '"', '(', '{', '}', ')', '[', '<', '>', ']']
>>> doBracketsMatch(patterns)
True
>>> doBracketsMatch(['"', ')', '"', '[', ']', '\\''])
False
"""
# Dependencies
import re
# Global Variables
# Provide opening and closing patterns, along with their priorities & whether a priority is nestable
opening = ['<--', '/*', '//', '#', '"', '\'', '(', '[', '{', '<']
closing = ['-->', '*/', '\n', '\n', '"', '\'', ')', ']', '}', '>']
priority = [ 1, 1, 1, 1, 1, 1, 0, 0, 0, 0]
nestable = {0: True, 1: False}
bracket_pairs = dict(zip(opening + closing, \
[[(closing + opening)[i], (priority + priority)[i]] \
for i in range(0, opening.__len__() * 2)]))
def getRegexFromList(listOfPatterns):
"""
Generate the search term for the regular expression
:param listOfPatterns:
:return:
>>> getRegexFromList(['"', '<--', '##', 'test'])
'(\\\\t\\\\e\\\\s\\\\t|\\\\<\\\\-\\\\-|\\\\#\\\\#|\\\\")'
"""
# Longer patterns first to prevent false negatives
search_terms = sorted(listOfPatterns, key=len, reverse=True)
regex = ""
for term in search_terms:
for char in str(term):
regex = regex + '\\' + char # Search for all characters literally
regex = regex + '|' # Search pattern = (a|b|c)
return '(' + regex[:-1] + ')' # Remove excess '|' and add brackets
def doBracketsMatch(list_of_brackets):
"""
Determine if brackets match up
:param list_of_brackets:
:return:
"""
stack = []
for bracket in list_of_brackets:
# Check empty stack conditions
if stack.__len__() is 0:
# Check for openings first to catch quotes
if bracket in opening:
stack.append(bracket)
elif bracket in closing:
return False
else:
continue
# Check for a matching bracket
elif bracket == bracket_pairs[stack[-1]][0]:
stack.pop()
# Ignore cases:
# - False positives
# - Lower priority brackets
# - Equal priority brackets if nesting is not allowed
elif bracket not in bracket_pairs or \
bracket_pairs[bracket][1] < bracket_pairs[stack[-1]][1] or \
(bracket_pairs[bracket][1] == bracket_pairs[stack[-1]][1] and \
not nestable[bracket_pairs[bracket][1]]):
continue
# New open bracket
elif bracket in opening:
stack.append(bracket)
# Otherwise, unpaired close bracket
else:
return False
# If stack isn't empty, then there is an unpaired open bracket
return not bool(stack)
if __name__ == '__main__':
import doctest
doctest.testmod()

Categories