replace semicolon by newline in python code - python

I would like to parse Python code that contains semicolons ; separating statements, and produce code that replaces them with newlines \n. E.g., from
def main():
    a = "a;b"; return a
I'd like to produce
def main():
    a = "a;b"
    return a
Any hints?

Use the tokenize library to look for token.OP tokens, where the second element is a ; *. Replace these tokens with a token.NEWLINE token.
You'd also need to adjust your token offsets and generate matching indentation, however: after each inserted NEWLINE you have to adjust the line numbers (incrementing by an offset that grows with every NEWLINE you insert), and the 'next' line (the remainder of the current line) has to have its column indices adjusted to match the current indentation level:
import tokenize

# tokenize.TokenInfo only exists on Python 3; fall back to a plain tuple on Python 2
TokenInfo = getattr(tokenize, 'TokenInfo', lambda *a: a)

def semicolon_to_newline(tokens):
    line_offset = 0
    last_indent = None
    col_offset = None  # None or an integer
    for ttype, tstr, (slno, scol), (elno, ecol), line in tokens:
        slno, elno = slno + line_offset, elno + line_offset
        if ttype in (tokenize.INDENT, tokenize.DEDENT):
            last_indent = ecol  # block is indented to this column
        elif ttype == tokenize.OP and tstr == ';':
            # swap out semicolon with a newline
            ttype = tokenize.NEWLINE
            tstr = '\n'
            line_offset += 1
            if col_offset is not None:
                scol, ecol = scol - col_offset, ecol - col_offset
            col_offset = 0  # next tokens should start at the current indent
        elif col_offset is not None:
            if not col_offset:
                # adjust column by starting column of next token
                col_offset = scol - last_indent
            scol, ecol = scol - col_offset, ecol - col_offset
            if ttype == tokenize.NEWLINE:
                col_offset = None
        yield TokenInfo(
            ttype, tstr, (slno, scol), (elno, ecol), line)

with open(sourcefile, 'r') as source, open(destination, 'w') as dest:
    generator = tokenize.generate_tokens(source.readline)
    dest.write(tokenize.untokenize(semicolon_to_newline(generator)))
Note that I don't bother to correct the line value; it is informative only, and the data read from the file is not actually used when un-tokenizing.
Demo:
>>> from io import StringIO
>>> source = StringIO('''\
... def main():
...     a = "a;b"; return a
... ''')
>>> generator = tokenize.generate_tokens(source.readline)
>>> result = tokenize.untokenize(semicolon_to_newline(generator))
>>> print(result)
def main():
    a = "a;b"
    return a
and slightly more complex:
>>> source = StringIO('''\
... class Foo(object):
...     def bar(self):
...         a = 10; b = 11; c = 12
...         if self.spam:
...             x = 12; return x
...         x = 15; return y
...
...     def baz(self):
...         return self.bar;
...         # note, nothing after the semicolon
... ''')
>>> generator = tokenize.generate_tokens(source.readline)
>>> result = tokenize.untokenize(semicolon_to_newline(generator))
>>> print(result)
class Foo(object):
    def bar(self):
        a = 10
        b = 11
        c = 12
        if self.spam:
            x = 12
            return x
        x = 15
        return y
    def baz(self):
        return self.bar
        # note, nothing after the semicolon
>>> print(result.replace(' ', '.'))
class.Foo(object):
....def.bar(self):
........a.=.10
........b.=.11
........c.=.12
........if.self.spam:
............x.=.12
............return.x
........x.=.15
........return.y
....def.baz(self):
........return.self.bar
........
........#.note,.nothing.after.the.semicolon
* The Python 3 version of tokenize outputs more informative TokenInfo named tuples, which have an extra exact_type attribute that can be used instead of doing a text match: tok.exact_type == tokenize.SEMI. I kept the above compatible with Python 2 and 3 however.
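For example, on Python 3 only, the semicolon test can use the exact_type attribute directly; a minimal, self-contained sketch (separate from the rewriting generator above):
import io
import tokenize

source = io.StringIO('a = 1; b = 2\n')
for tok in tokenize.generate_tokens(source.readline):
    # on Python 3, TokenInfo carries exact_type, so no string comparison is needed
    if tok.exact_type == tokenize.SEMI:
        print('semicolon at', tok.start)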

Here's a pyparsing solution - see comments in the code below:
from pyparsing import Literal, restOfLine, quotedString, pythonStyleComment, line

SEMI = Literal(';')
patt = SEMI + restOfLine
patt.ignore(quotedString)
patt.ignore(pythonStyleComment)

def split_at(s, locs):
    """
    break up s into pieces, given list of break locations
    """
    current = 0
    ret = []
    for loc in locs:
        ret.append(s[current:loc].lstrip())
        current = loc+1
    ret.append(s[current:].lstrip())
    return ret

def split_on_semicolon(s, l, tokens):
    """
    parse time callback, when finding first unquoted ';' on a line
    """
    current_line = line(l, s)
    line_body = current_line.lstrip()
    indent = current_line.index(line_body)
    indent = current_line[:indent]
    # may be more than one ';' on this line, find them all
    # (the second token contains everything after the ';')
    remainder = tokens[1]
    if remainder.strip():
        all_semis = [s for _, s, _ in SEMI.scanString(remainder)]
        # break line into pieces
        pieces = split_at(remainder, all_semis)
        # rejoin pieces, with leading indents
        return '\n' + '\n'.join(indent + piece for piece in pieces)
    else:
        return ''

patt.addParseAction(split_on_semicolon)
sample = """
def main():
    this_semi_does_nothing();
    neither_does_this_but_there_are_spaces_afterward();
    a = "a;b"; return a # this is a comment; it has a semicolon!
def b():
    if False:
        z=1000;b("; in quotes"); c=200;return z
    return ';'
class Foo(object):
    def bar(self):
        '''a docstring; with a semicolon'''
        a = 10; b = 11; c = 12
        # this comment; has several; semicolons
        if self.spam:
            x = 12; return x # so; does; this; one
        x = 15;;; y += x; return y
    def baz(self):
        return self.bar
"""
print(patt.transformString(sample))
Gives:
def main():
    this_semi_does_nothing()
    neither_does_this_but_there_are_spaces_afterward()
    a = "a;b"
    return a # this is a comment; it has a semicolon!
def b():
    if False:
        z=1000
        b("; in quotes")
        c=200
        return z
    return ';'
class Foo(object):
    def bar(self):
        '''a docstring; with a semicolon'''
        a = 10
        b = 11
        c = 12
        # this comment; has several; semicolons
        if self.spam:
            x = 12
            return x # so; does; this; one
        x = 15
        y += x
        return y
    def baz(self):
        return self.bar
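If you want to apply this to a file and save the result, transformString simply takes the whole source text; a short sketch (sourcefile and destination are placeholder paths, as in the tokenize answer above):
with open(sourcefile, 'r') as src, open(destination, 'w') as dest:
    dest.write(patt.transformString(src.read()))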

Related

How to create a cyclic iterator over the alphabet in Python?

Given the following scenario:
import string
UPPERCASE_ALPHABET = list(string.ascii_uppercase)
LOWERCASE_ALPHABET = list(string.ascii_lowercase)
How to create a cyclic loop over the alphabet jumping N positions?
Example 1:
letter = a, jump = 5
Result: f
Example 2:
letter = z, jump = 5
Result: e
So far, I got:
import string

UPPERCASE_ALPHABET = list(string.ascii_uppercase)
LOWERCASE_ALPHABET = list(string.ascii_lowercase)

def forward(letter, jump):
    alphabet = LOWERCASE_ALPHABET if letter.islower() else UPPERCASE_ALPHABET
    index = alphabet.index(letter)
    count = 0
    while True:
        if count == jump:
            return alphabet[index]
        if index == len(alphabet):
            index = 0
        index += 1
        count += 1

print forward('a', 5)
print forward('z', 5)
But it doesn't look Pythonic at all...
Is there a better and Pythonic way of doing this? Maybe using chr(ord('N') + position) ?
I think you had the right idea with ord and chr:
import string

def forward(letter, jump):
    if letter.islower():
        start_character = ord('a')
    else:
        start_character = ord('A')
    start = ord(letter) - start_character
    offset = ((start + jump) % 26) + start_character
    result = chr(offset)
    return result

print forward('a', 5)
print forward('z', 5)
print forward('z', 1)
print forward('a', 26)
print forward('A', 5)
print forward('Z', 5)
print forward('Z', 1)
print forward('A', 26)
Output
f
e
a
a
F
E
A
A
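If you'd rather keep the list-based setup from the question, the same wrap-around can be done by indexing with a modulo; a small sketch along the same lines:
import string

LOWERCASE_ALPHABET = list(string.ascii_lowercase)
UPPERCASE_ALPHABET = list(string.ascii_uppercase)

def forward(letter, jump):
    # wrap around the end of the list with a modulo instead of looping
    alphabet = LOWERCASE_ALPHABET if letter.islower() else UPPERCASE_ALPHABET
    return alphabet[(alphabet.index(letter) + jump) % len(alphabet)]

print(forward('a', 5))  # f
print(forward('z', 5))  # e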
I'd write a custom iterator class to encapsulate itertools.cycle() and provide a skip() functionality, e.g.:
import itertools

class CyclicSkipIterator(object):
    def __init__(self, iterable):
        self._iterator = itertools.cycle(iterable)

    def __iter__(self):
        return self

    def next(self):  # use __next__ on Python 3.x
        return next(self._iterator)

    def skip(self, number=1):
        for i in xrange(number):  # use range() on Python 3.x
            next(self._iterator)
Then you can do exactly what you wanted with it:
import string
LOWERCASE_ALPHABET = list(string.ascii_lowercase)
lower_iter = CyclicSkipIterator(LOWERCASE_ALPHABET)
print(next(lower_iter)) # a
lower_iter.skip(4) # skip next 4 letters: b, c, d, e
print(next(lower_iter)) # f
lower_iter.skip(19) # skip another 19 letters to arrive at z
print(next(lower_iter)) # z
lower_iter.skip(4) # skip next 4 letters: a, b, c, d
print(next(lower_iter)) # e
You can add even more functionality if you wanted to, like reversing, switching iterables mid-iteration etc.
UPDATE: If you want to jump to a specific element in the list, you can add a method for that to the CyclicSkipIterator:
class CyclicSkipIterator(object):
    def __init__(self, iterable):
        self._iterator = itertools.cycle(iterable)

    def __iter__(self):
        return self

    def __next__(self):  # use next() instead on Python 2.x
        return next(self._iterator)

    def skip(self, number=1):
        for _ in range(number):  # use xrange() instead on Python 2.x
            next(self._iterator)

    def skip_to(self, element, max_count=100):  # max_count protects against endless cycling
        max_count = max(1, max_count)  # ensure at least one iteration
        for _ in range(max_count):
            e = next(self._iterator)
            if element == e:
                break
Then you can skip_to whatever letter you want:
import string
LOWERCASE_ALPHABET = list(string.ascii_lowercase)
lower_iter = CyclicSkipIterator(LOWERCASE_ALPHABET)
print(next(lower_iter)) # a
lower_iter.skip(4) # skip 4 letters: b, c, d, e
print(next(lower_iter)) # f
lower_iter.skip_to("y") # skip all letters up to y
print(next(lower_iter)) # z
lower_iter.skip(4) # skip 4 letters: a, b, c, d
print(next(lower_iter)) # e
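If all you need is the skipping itself, itertools can do it without a custom class, using the standard 'consume' idiom with islice; a small sketch:
import itertools
import string

letters = itertools.cycle(string.ascii_lowercase)
print(next(letters))                           # a
next(itertools.islice(letters, 4, 4), None)    # advance 4 steps: b, c, d, e
print(next(letters))                           # f
next(itertools.islice(letters, 19, 19), None)  # advance 19 more steps: g through y
print(next(letters))                           # z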
class CyclicIterator:
    def __init__(self, lst):
        self.lst = lst
        self.i = 0

    def __iter__(self):
        return self

    def __next__(self):
        result = self.lst[self.i % len(self.lst)]
        self.i += 3  # increasing by 3
        return result
A class that defines __iter__() and __next__() satisfies the iterator protocol. Create an instance of this iterator class:
iter_cycle = CyclicIterator('abcdefghijklmnopqrstuvwxyz')
numbers = range(1, 27, 3)  # the 26 letter positions, in steps of 3
list(zip(numbers, iter_cycle))

Instances of class polynomial

How can I get the coefficients into a list from each of these three different ways of creating new instances of class Polynomial?
class Polynomial(object):
    def __init__(self, *args):
        self.coeffs = []
        ...

pol1 = Polynomial([1,-3,0,2])
pol2 = Polynomial(1,-3,0,2)
pol3 = Polynomial(x0=1,x3=2,x1=-3)
I am expecting, for example, that pol2 = Polynomial(1,-3,0,2) prints as 2x^3-3x+1. But I need to get the coefficients into a list so I can work with them.
Assuming that one of the three ways is always used, you can do the following (without any validation):
class Polynomial(object):
    def __init__(self, *args, **kwargs):
        if args and isinstance(args[0], list):  # Polynomial([1,-3,0,2])
            self.coeffs = args[0]
        elif args:  # Polynomial(1,-3,0,2)
            self.coeffs = args
        else:  # Polynomial(x0=1,x3=2,x1=-3)
            self.coeffs = [kwargs.get(x, 0) for x in ('x0', 'x1', 'x2', 'x3')]

    def __str__(self):
        s = ''
        for i, x in reversed(list(enumerate(self.coeffs))):
            if x:
                if x > 0:
                    s += '+'
                s += str(x)
                if i > 0:
                    s += 'x'
                if i > 1:
                    s += '^' + str(i)
        return '0' if not s else s.lstrip('+')
pol1 = Polynomial([1,-3,0,2])
pol2 = Polynomial(1,-3,0,2)
pol3 = Polynomial(x0=1, x1=-3, x3=2)
print(pol1) # 2x^3-3x+1
print(pol2) # 2x^3-3x+1
print(pol3) # 2x^3-3x+1
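One caveat worth noting (my addition, not part of the original answer): with the positional form Polynomial(1,-3,0,2), coeffs ends up as a tuple rather than a list, so normalize it if you need to mutate it:
pol2 = Polynomial(1, -3, 0, 2)
coeffs = list(pol2.coeffs)  # always a plain list, whichever constructor form was used
coeffs[1] += 1              # now safe to modify
print(coeffs)               # [1, -2, 0, 2]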
In addition to schwobaseggl's response, I'd add this kind of checking:
if args and type(args[0]) == list:
    self.coeffs = args[0]
# ...
else:
    # build the list from the keyword names, so gaps such as x0, x1, x3 are handled too
    highest = max(int(k[1:]) for k in kwargs)
    self.coeffs = [kwargs.get('x' + str(i), 0) for i in range(highest + 1)]

Why does this character ▯ appear?

So this character ▯ appears when I run my code, which I think means there is a missing character so it can't be displayed. (Not sure, correct me if I am wrong.) Basically, I want to be able to get rid of that character. Here is what it looks like when I run my code:
However, in the back end in IDLE, when I click on one of the boxes for it to be displayed up top, it doesn't register and looks like this in IDLE:
Why does it appear on screen if it isn't going to appear in IDLE?
Also, how can I get rid of the ▯ character from the main screen?
Here is my full code.
Here are the segments in which I think the problem lies. (However, I have not been able to solve the problem.)
My classes for the tree comparison that finds the sentences and how frequently they are used:
class Branch():
    def __init__(self, value):
        self.left = None
        self.right = None
        self.value = value
        self.frequency = 1

    def incFreq(self):
        self.frequency = self.frequency + 1

    def freq(self):
        return self.frequency

class Tree():
    highest = []

    def __init__(self):
        self.root = None
        self.found = False

    def findHighest(self):
        from operator import itemgetter, attrgetter
        self.highest = []
        self.inorder(self.root)
        self.highest = sorted(self.highest, key=itemgetter(1), reverse=True)
        return self.highest

    #lessThan function needed to compare strings
    def lessThan(self, a, b):
        if len(a) < len(b):
            loopCount = len(a)
        else:
            loopCount = len(b)
        for pos in range(0, loopCount):
            if a[pos] > b[pos]:
                return False
        return True

    def outputTree(self):
        self.inorder(self.root)

    def insert(self, value):
        #increment freq if already exists, else insert
        if not self.exists(value):
            self.root = self.insertAtBranch(self.root, value)

    def exists(self, value):
        #set the class variable found to False to assume it is not there
        self.found = False
        self.findAtBranch(self.root, value)
        return self.found

    #Used to find a value in a tree
    def findAtBranch(self, branch, value):
        if branch == None:
            pass
        else:
            #print ("[" + branch.value + "][" + value + "]") # Error checking
            if branch.value == value:
                self.found = True
                #print("found " + value)
                branch.incFreq()
                #print(branch.freq())
            else:
                self.findAtBranch(branch.left, value)
                self.findAtBranch(branch.right, value)

    def insertAtBranch(self, branch, value):
        if branch == None:
            return Branch(value)
        else:
            if self.lessThan(branch.value, value):
                branch.right = self.insertAtBranch(branch.right, value)
            else:
                branch.left = self.insertAtBranch(branch.left, value)
            return branch

    def inorder(self, branch):
        if branch == None: return
        self.highest.append((branch.value, branch.freq()))
        #print (branch.value)
        #print (branch.freq())
        #print(self.highest[0])
        self.inorder(branch.left)
        self.inorder(branch.right)
This is where I use the tree and pass sentences to be used on a different function:
def getPhrases(self, numToReturn):
    topPhrases = []
    phrasesTree = Tree()
    #load tree with phrases from phrase text file
    file = open('setPhrases.txt', 'r')
    for line in file:
        phrasesTree.insert(line)
    #create a list of the top n of phrases to return
    val = 0
    for phrase in phrasesTree.findHighest():
        if val < numToReturn:
            topPhrases.append(phrase)
        val = val + 1
    return topPhrases
This is where I use the sentences to be able to display them on the screen:
def createPhrases(self):
    print("createPhrases")
    self.deletePanes()
    self.show_keyboard = False
    self.show_words = False
    self.show_phrases = True
    self.show_terminal = True
    words = self.getPhrases(10)
    for word, count in words:
        self.addPane("{}".format(word, count), WORDS)
    self.addPane("Boxes", PHRASE)
    self.addPane("Keyboard", PHRASE)
    self.addPane("OK", PHRASE)
    self.drawPanes()
When you read lines from a file, the newline characters are still at the end of each line. pygame's documentation states that:
The text can only be a single line: newline characters are not rendered.
So, you should change this fragment:
file = open('setPhrases.txt', 'r')
for line in file:
    phrasesTree.insert(line)
to this:
file = open('setPhrases.txt', 'r')
for line in file:
    phrasesTree.insert(line.strip())
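As a side note (my addition, not from the original answer): the file handle is never closed, and strip() also removes leading whitespace. A with block closes the file automatically, and rstrip('\n') drops only the trailing newline:
with open('setPhrases.txt', 'r') as phrase_file:
    for line in phrase_file:
        phrasesTree.insert(line.rstrip('\n'))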

Python compiler for simple language to java vm code algorithm

I am trying to write a compiler (yes, it is homework) for a simple language, which I shall describe if necessary, targeting Java VM code.
It currently works pretty well; I've just hit a bump with logical ANDs and ORs.
Each works fine as a single if/while condition, but if I try to chain them, things go wrong. Correct me if I am wrong, but I believe AND has precedence over OR. What I'm really wondering is whether there are logical ways of arranging the generated code: at the moment the JVM output just has the compare and jump statements one after the other, which seems wrong. I realise this is quite abstract, so maybe what I'm after is pseudocode or an algorithm for how to structure chained ANDs and ORs.
EDIT: Currently it just treats any combination of AND and OR as ANDs. Comparing the factor/term/expression hierarchy with booleanfactor/booleanterm/booleanexpression, I believe AND has precedence? Just a thought.
Apologies if this is poorly understood :/
So I figure I'll include the relevant info just in case.
compiler
import re
import sys
# Restrictions:
# Integer constants must be short.
# Stack size must not exceed 1024.
# Integer is the only type.
# Logical operators cannot be nested.
class Scanner:
'''The interface comprises the methods lookahead and consume.
Other methods should not be called from outside of this class.'''
def __init__(self, input_file):
'''Reads the whole input_file to input_string.'''
# source code of the program to be compiled
self.input_string = input_file.read()
# index where the unprocessed part of input_string starts
self.current_char_index = 0
# a pair (most recently read token, matched substring of input_string)
self.current_token = self.get_token()
def skip_white_space(self):
'''Consumes all characters in input_string up to the next
non-white-space character.'''
if (self.current_char_index >= len(self.input_string) - 1):
# bad fix for it over-running the end of the file
return
while self.input_string[self.current_char_index].isspace():
self.current_char_index += 1
return
def get_token(self):
'''Returns the next token and the part of input_string it matched.
Returns None if there is no next token.
The characters up to the end of the token are consumed.'''
self.skip_white_space()
# find the longest prefix of input_string that matches a token
token, longest = None, ''
for (t, r) in Token.token_regexp:
match = re.match(r, self.input_string[self.current_char_index:])
if match and match.end() > len(longest):
token, longest = t, match.group()
# consume the token by moving the index to the end of the matched part
self.current_char_index += len(longest)
return (token, longest)
def lookahead(self):
'''Returns the next token without consuming it.
Returns None if there is no next token.'''
return self.current_token[0]
def consume(self, *tokens):
'''Returns the next token and consumes it, if it is in tokens.
Raises an exception otherwise.
If the token is a number or an identifier, its value is returned.'''
if self.current_token[0] not in tokens:
print('Token ' + self.current_token[0] + ' isn\'t in the tokens: ')
for token in tokens:
print(token)
raise Exception('Token is not in tokens this shouldn\'t happen much')
if self.current_token[0] == 'ID':
symbol_table.location(self.current_token[1])
value = self.current_token[1]
elif (self.current_token[0] == 'NUM'):
value = self.current_token[1]
else:
value = self.current_token[0]
self.current_token = self.get_token()
return value
class Token:
DO = 'DO';
ELSE = 'ELSE';
END = 'END';
IF = 'IF';
THEN = 'THEN';
WHILE = 'WHILE';
SEM = 'SEM';
BEC = 'BEC';
LESS = 'LESS';
EQ = 'EQ';
GRTR = 'GRTR';
LEQ = 'LEQ';
NEQ = 'NEQ';
GEQ = 'GEQ';
ADD = 'ADD';
SUB = 'SUB';
MUL = 'MUL';
DIV = 'DIV';
LPAR = 'LPAR';
RPAR = 'RPAR';
NUM = 'NUM';
ID = 'ID';
READ = 'READ';
WRITE = 'WRITE';
OR = 'OR';
AND = 'AND';
NOT = 'NOT';
# The following list gives the regular expression to match a token.
# The order in the list matters for mimicking Flex behaviour.
# Longer matches are preferred over shorter ones.
# For same-length matches, the first in the list is preferred.
token_regexp = [
(DO, 'do'),
(ELSE, 'else'),
(END, 'end'),
(IF, 'if'),
(THEN, 'then'),
(WHILE, 'while'),
(READ, 'read'),
(WRITE, 'write'),
(OR, 'or'),
(AND, 'and'),
(NOT, 'not'),
(SEM, ';'),
(BEC, ':='),
(LESS, '<'),
(EQ, '='),
(NEQ, '!='),
(GRTR, '>'),
(LEQ, '<='),
(GEQ, '>='),
(ADD, '[+]'), # + is special in regular expressions
(SUB, '-'),
(MUL, '[*]'),
(DIV, '/'),
(LPAR, '[(]'), # ( is special in regular expressions
(RPAR, '[)]'), # ) is special in regular expressions
(ID, '[a-z]+'),
(NUM, '[0-9]+'),
]
class Symbol_Table:
'''A symbol table maps identifiers to locations.'''
def __init__(self):
self.symbol_table = {}
def size(self):
'''Returns the number of entries in the symbol table.'''
return len(self.symbol_table)
def location(self, identifier):
'''Returns the location of an identifier. If the identifier is not in
the symbol table, it is entered with a new location. Locations are
numbered sequentially starting with 0.'''
if identifier in self.symbol_table:
return self.symbol_table[identifier]
index = len(self.symbol_table)
self.symbol_table[identifier] = index
return index
class Label:
def __init__(self):
self.current_label = 0
def next(self):
'''Returns a new, unique label.'''
self.current_label += 1
return 'l' + str(self.current_label)
def indent(s, level):
return ' '*level + s + '\n'
# Each of the following classes is a kind of node in the abstract syntax tree.
# indented(level) returns a string that shows the tree levels by indentation.
# code() returns a string with JVM bytecode implementing the tree fragment.
# true_code/false_code(label) jumps to label if the condition is/is not true.
# Execution of the generated code leaves the value of expressions on the stack.
class Program_AST:
def __init__(self, program):
self.program = program
def __repr__(self):
return repr(self.program)
def indented(self, level):
return self.program.indented(level)
def code(self):
program = self.program.code()
local = symbol_table.size()
java_scanner = symbol_table.location('Java Scanner')
return '.class public Program\n' + \
'.super java/lang/Object\n' + \
'.method public <init>()V\n' + \
'aload_0\n' + \
'invokenonvirtual java/lang/Object/<init>()V\n' + \
'return\n' + \
'.end method\n' + \
'.method public static main([Ljava/lang/String;)V\n' + \
'.limit locals ' + str(local) + '\n' + \
'.limit stack 1024\n' + \
'new java/util/Scanner\n' + \
'dup\n' + \
'getstatic java/lang/System.in Ljava/io/InputStream;\n' + \
'invokespecial java/util/Scanner.<init>(Ljava/io/InputStream;)V\n' + \
'astore ' + str(java_scanner) + '\n' + \
program + \
'return\n' + \
'.end method\n'
class Statements_AST:
def __init__(self, statements):
self.statements = statements
def __repr__(self):
result = repr(self.statements[0])
for st in self.statements[1:]:
result += '; ' + repr(st)
return result
def indented(self, level):
result = indent('Statement(s)', level)
for st in self.statements:
result += st.indented(level+1)
return result
def code(self):
result = ''
for st in self.statements:
result += st.code()
return result
class If_AST:
def __init__(self, boolean_expression, then):
self.boolean_expression = boolean_expression
self.then = then
def __repr__(self):
return 'if ' + repr(self.boolean_expression) + ' then ' + \
repr(self.then) + ' end'
def indented(self, level):
return indent('If-Then', level) + \
self.boolean_expression.indented(level+1) + \
self.then.indented(level+1)
def code(self):
l1 = label_generator.next()
return self.boolean_expression.code(l1) + \
self.then.code() + \
l1 + ':\n'
class If_Else_AST:
def __init__(self, boolean_expression, then, _else):
self.boolean_expression = boolean_expression;
self.then = then;
self._else = _else;
def __repr__(self):
return 'if ' + repr(self.boolean_expression) + ' then ' + \
repr(self.then) + ' else ' + \
repr(self._else) + ' end'
def indented(self, level):
return indent('If-Then-Else', level) + \
self.boolean_expression.indented(level+1) + \
self.then.indented(level+1) + \
indent('Else', level+1) + \
self._else.indented(level+1)
def code(self):
l1 = label_generator.next()
l2 = label_generator.next()
return self.boolean_expression.code(l1) + \
self.then.code() + \
'goto ' + l2 + '\n' + \
l1 + ':\n' + \
self._else.code() + \
l2 + ':\n'
class While_AST:
def __init__(self, boolean_term, body):
self.boolean_term = boolean_term
self.body = body
def __repr__(self):
return 'while ' + repr(self.boolean_term) + ' do ' + \
repr(self.body) + ' end'
def indented(self, level):
return indent('While-Do', level) + \
self.boolean_term.indented(level+1) + \
self.body.indented(level+2)
def code(self):
l1 = label_generator.next()
l2 = label_generator.next()
return l1 + ':\n' + \
self.boolean_term.code(l2) + \
self.body.code() + \
'goto ' + l1 + '\n' + \
l2 + ':\n'
class Assign_AST:
def __init__(self, identifier, expression):
self.identifier = identifier
self.expression = expression
def __repr__(self):
return repr(self.identifier) + ':=' + repr(self.expression)
def indented(self, level):
return indent('Assign', level) + \
self.identifier.indented(level+1) + \
self.expression.indented(level+1)
def code(self):
loc = symbol_table.location(self.identifier.identifier)
return self.expression.code() + \
'istore ' + str(loc) + '\n'
class Write_AST:
def __init__(self, expression):
self.expression = expression
def __repr__(self):
return 'write ' + repr(self.expression)
def indented(self, level):
return indent('Write', level) + self.expression.indented(level+1)
def code(self):
return 'getstatic java/lang/System/out Ljava/io/PrintStream;\n' + \
self.expression.code() + \
'invokestatic java/lang/String/valueOf(I)Ljava/lang/String;\n' + \
'invokevirtual java/io/PrintStream/println(Ljava/lang/String;)V\n'
class Read_AST:
def __init__(self, identifier):
self.identifier = identifier
def __repr__(self):
return 'read ' + repr(self.identifier)
def indented(self, level):
return indent('Read', level) + self.identifier.indented(level+1)
def code(self):
java_scanner = symbol_table.location('Java Scanner')
loc = symbol_table.location(self.identifier.identifier)
return 'aload ' + str(java_scanner) + '\n' + \
'invokevirtual java/util/Scanner.nextInt()I\n' + \
'istore ' + str(loc) + '\n'
class Comparison_AST:
def __init__(self, left, op, right):
self.left = left
self.op = op
self.right = right
def __repr__(self):
op = { Token.LESS:'<', Token.EQ:'=', Token.GRTR:'>',
Token.LEQ:'<=', Token.NEQ:'!=', Token.GEQ:'>=' }
return repr(self.left) + op[self.op] + repr(self.right)
def indented(self, level):
return indent(self.op, level) + \
self.left.indented(level+1) + \
self.right.indented(level+1)
def true_code(self, label):
op = { Token.LESS:'if_icmplt', Token.EQ:'if_icmpeq',
Token.GRTR:'if_icmpgt', Token.LEQ:'if_icmple',
Token.NEQ:'if_icmpne', Token.GEQ:'if_icmpge' }
return self.left.code() + \
self.right.code() + \
op[self.op] + ' ' + label + '\n'
def false_code(self, label):
# Negate each comparison because of jump to "false" label.
op = { Token.LESS:'if_icmpge', Token.EQ:'if_icmpne',
Token.GRTR:'if_icmple', Token.LEQ:'if_icmpgt',
Token.NEQ:'if_icmpeq', Token.GEQ:'if_icmplt' }
return self.left.code() + \
self.right.code() + \
op[self.op] + ' ' + label + '\n'
class Expression_AST:
def __init__(self, left, op, right):
self.left = left
self.op = op
self.right = right
def __repr__(self):
op = { Token.ADD:'+', Token.SUB:'-', Token.MUL:'*', Token.DIV:'/' }
return '(' + repr(self.left) + op[self.op] + repr(self.right) + ')'
def indented(self, level):
return indent(self.op, level) + \
self.left.indented(level+1) + \
self.right.indented(level+1)
def code(self):
op = { Token.ADD:'iadd', Token.SUB:'isub',
Token.MUL:'imul', Token.DIV:'idiv' }
return self.left.code() + \
self.right.code() + \
op[self.op] + '\n'
class Number_AST:
def __init__(self, number):
self.number = number
def __repr__(self):
return self.number
def indented(self, level):
return indent(self.number, level)
def code(self): # works only for short numbers
return 'sipush ' + self.number + '\n'
class Identifier_AST:
def __init__(self, identifier):
self.identifier = identifier
def __repr__(self):
return self.identifier
def indented(self, level):
return indent(self.identifier, level)
def code(self):
loc = symbol_table.location(self.identifier)
return 'iload ' + str(loc) + '\n'
class BooleanFactor_AST:
def __init__(self, condition, logic):
self.condition = condition
self.logic = logic
def __repr__(self):
if self.logic == False:
return 'NOT ' + repr(self.condition)
else:
return repr(self.condition)
def indented(self, level):
if self.logic == False:
return indent('NOT ', level) + self.condition.indented(level + 1)
else:
return self.condition.indented(level)
def false_code(self, label):
if self.logic == True:
return self.condition.false_code(label)
else:
return self.condition.true_code(label)
return
def true_code(self, label):
if self.logic == True:
return self.condition.true_code(label)
else:
return self.condition.false_code(label)
class BooleanTerm_AST:
def __init__(self, terms):
self.terms = terms
def __repr__(self):
result = repr(self.terms[0])
for term in self.terms[1:]:
result = result + ' AND ' + repr(term)
return result
def indented(self, level):
result = self.terms[0].indented(level)
for term in self.terms[1:]:
result = result + indent('AND', level)
result = result + term.indented(level)
return result
def code(self, label):
result = ''
for term in self.terms:
result = result + term.false_code(label)
return result
class BooleanExpression_AST:
def __init__(self, expressions):
self.expressions = expressions
def __repr__(self):
result = repr(self.expressions[0])
for expression in self.expressions[1:]:
result = result + ' OR ' + repr(expression)
return result
def indented(self, level):
result = self.expressions[0].indented(level)
indentation = 0
for expression in self.expressions[1:]:
indentation += 1
result = result + indent('OR', level + indentation)
result = result + expression.indented(level + indentation)
return result
def code(self, label):
result = ''
for expression in self.expressions:
result = result + expression.code(label)
return result
# The following methods comprise the recursive-descent parser.
def program():
sts = statements()
return Program_AST(sts)
def statements():
result = [statement()]
while scanner.lookahead() == Token.SEM:
scanner.consume(Token.SEM)
st = statement()
result.append(st)
return Statements_AST(result)
def statement():
if scanner.lookahead() == Token.IF:
return if_statement()
elif scanner.lookahead() == Token.WHILE:
return while_statement()
elif scanner.lookahead() == Token.ID:
return assignment()
elif scanner.lookahead() == Token.READ:
return read();
elif scanner.lookahead() == Token.WRITE:
return write();
else: # error
return scanner.consume(Token.IF, Token.WHILE, Token.ID)
def if_statement():
scanner.consume(Token.IF)
condition = boolean_expression()
scanner.consume(Token.THEN)
then = statements()
if scanner.lookahead() == Token.END:
scanner.consume(Token.END)
return If_AST(condition, then)
else:
scanner.consume(Token.ELSE)
_else = statements()
scanner.consume(Token.END)
return If_Else_AST(condition, then, _else)
def while_statement():
scanner.consume(Token.WHILE)
condition = boolean_expression()
scanner.consume(Token.DO)
body = statements()
scanner.consume(Token.END)
return While_AST(condition, body)
def assignment():
ident = identifier()
scanner.consume(Token.BEC)
expr = expression()
return Assign_AST(ident, expr)
def read():
scanner.consume(Token.READ)
variable = identifier()
return Read_AST(variable)
def write():
scanner.consume(Token.WRITE)
expr = expression()
return Write_AST(expr)
def comparison():
left = expression()
op = scanner.consume(Token.LESS, Token.EQ, Token.GRTR,
Token.LEQ, Token.NEQ, Token.GEQ)
right = expression()
return Comparison_AST(left, op, right)
def expression():
result = term()
while scanner.lookahead() in [Token.ADD, Token.SUB]:
op = scanner.consume(Token.ADD, Token.SUB)
tree = term()
result = Expression_AST(result, op, tree)
return result
def term():
result = factor()
while scanner.lookahead() in [Token.MUL, Token.DIV]:
op = scanner.consume(Token.MUL, Token.DIV)
tree = factor()
result = Expression_AST(result, op, tree)
return result
def factor():
if scanner.lookahead() == Token.LPAR:
scanner.consume(Token.LPAR)
result = expression()
scanner.consume(Token.RPAR)
return result
elif scanner.lookahead() == Token.NUM:
value = scanner.consume(Token.NUM)
return Number_AST(value)
elif scanner.lookahead() == Token.ID:
return identifier()
else: # error
return scanner.consume(Token.LPAR, Token.NUM, Token.ID)
def identifier():
value = scanner.consume(Token.ID)
return Identifier_AST(value)
def boolean_factor():
if scanner.lookahead() == Token.NOT:
scanner.consume(Token.NOT)
logic = False
else:
logic = True
result = comparison()
return BooleanFactor_AST(result, logic)
def boolean_term():
result = [boolean_factor()]
while scanner.lookahead() in [Token.AND]:
scanner.consume(scanner.lookahead())
temp = boolean_factor()
result.append(temp)
return BooleanTerm_AST(result)
def boolean_expression():
result = [boolean_term()]
while scanner.lookahead() in [Token.OR]:
scanner.consume(scanner.lookahead())
temp = boolean_term()
result.append(temp)
return BooleanExpression_AST(result)
# Initialise scanner, symbol table and label generator.
#scanner = Scanner(open('test.txt'))
scanner = Scanner(sys.stdin)
symbol_table = Symbol_Table()
symbol_table.location('Java Scanner') # fix a location for the Java Scanner
label_generator = Label()
# Uncomment the following to test the scanner without the parser.
# This shows a list of all tokens in the input.
#
#token = scanner.lookahead()
#while token != None:
# print(token)
# scanner.consume(token)
# token = scanner.lookahead()
#exit()
# Call the parser.
ast = program()
assert scanner.lookahead() == None
# Uncomment the following to test the parser without the code generator.
# The first line gives back the program by calling __repr__ of the AST classes.
# The second line shows the syntax tree with levels indicated by indentation.
#
#print(ast)
#print(ast.indented(0))
#exit()
# Call the code generator.
# This translates the abstract syntax tree to JVM bytecode.
# It can be assembled to a class file by Jasmin: http://jasmin.sourceforge.net/
print(ast.code())
testing bat file
python compiler.py < test.txt > Program.j
java -Xmx100m -jar jasmin.jar Program.j
java -Xmx100m Program < testInput.txt > test_output.txt
and language (BNF)
Program = Statements
Statements = Statement (; Statement)*
Statement = If | While | Assignment
If = if Comparison then Statements end
While = while Comparison do Statements end
Assignment = identifier := Expression
Comparison = Expression Relation Expression
Relation = = | != | < | <= | > | >=
Expression = Term ((+ | -) Term)*
Term = Factor ((* | /) Factor)*
Factor = (Expression) | number | identifier
BooleanExpression = BooleanTerm (or BooleanTerm)*
BooleanTerm = BooleanFactor (and BooleanFactor)*
BooleanFactor = not BooleanFactor | Comparison
I think that's all that is relevant; cheers if you have a go at helping me with this.
If you want a method to chain ORs and ANDs, you can use De Morgan's law:
p v q === ¬(¬p ^ ¬q)
Since the two sides are equivalent, you can process everything in AND form. For example:
(p v q) ^ (r v s) === ¬(¬p ^ ¬q) ^ ¬(¬r ^ ¬s)
So evaluating the expression in AND form is simple with an algorithm.
I am assuming the expression doesn't have any parentheses; otherwise you need to give priority to the grouping symbols (), [], {}.
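To make that concrete against the AST classes in the posted compiler, here is a hedged sketch of the usual short-circuit lowering (my reading of the code, not a tested patch; it reuses the existing label_generator, false_code and true_code). The grammar already gives AND precedence over OR because BooleanTerm sits inside BooleanExpression, so the remaining work is in how the jumps are emitted: an AND chain jumps to the false label as soon as any factor fails, while an OR chain jumps past the remaining tests to a fresh true label as soon as any term succeeds, and only falls through to the false label if every term fails.
class BooleanTerm_AST:
    # ... __init__, __repr__ and indented as in the original ...

    def false_code(self, false_label):
        # AND chain: the moment any factor is false, the whole term is false
        result = ''
        for factor in self.terms:
            result += factor.false_code(false_label)
        return result

    def true_code(self, true_label):
        # AND chain: jump to true_label only if every factor holds
        skip_label = label_generator.next()
        result = ''
        for factor in self.terms[:-1]:
            result += factor.false_code(skip_label)
        result += self.terms[-1].true_code(true_label)
        result += skip_label + ':\n'
        return result

class BooleanExpression_AST:
    # ... __init__, __repr__ and indented as in the original ...

    def code(self, false_label):
        # OR chain: any true term short-circuits past the remaining tests;
        # only if every term fails do we jump to false_label
        true_label = label_generator.next()
        result = ''
        for term in self.expressions[:-1]:
            result += term.true_code(true_label)
        result += self.expressions[-1].false_code(false_label)
        result += true_label + ':\n'
        return result
With this arrangement the entry point stays condition.code(false_label), which is what If_AST, If_Else_AST and While_AST already expect: the generated bytecode falls through into the then/loop body when the whole condition is true and jumps to false_label otherwise.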

python - how to replace all link tags in html with an <em> tag?

I have a routine in PHP that converts a long article string into a shortened summary, but rather than doing this at page render time I'd like to store the summary in a database.
I have everything sorted out except for replacing link tags with <em>text</em>. I don't want to use Beautiful Soup for this; I'd rather use a simple regex replace if possible, or perhaps another feature of HTMLParser (I couldn't find anything after Googling for a while).
Here's what I have currently:
import HTMLParser, string, re
tag_end_re = re.compile(r'(\w+)[^>]*>')
entity_end_re = re.compile(r'(\w+;)')
class StrippingParser(HTMLParser.HTMLParser):
# These are the HTML tags that we will leave intact
valid_tags = ('a')
from htmlentitydefs import entitydefs # replace entitydefs from sgmllib
def __init__(self):
HTMLParser.HTMLParser.__init__(self)
self.result = ""
self.endTagList = []
def handle_data(self, data):
if data:
self.result = self.result + data
def handle_charref(self, name):
self.result = "%s&#%s;" % (self.result, name)
def handle_entityref(self, name):
if self.entitydefs.has_key(name):
x = ';'
else:
# this breaks unstandard entities that end with ';'
x = ''
self.result = "%s&%s%s" % (self.result, name, x)
def handle_starttag(self, tag, attrs):
""" Delete all tags except for legal ones """
if tag in self.valid_tags:
self.result = self.result + '<' + tag
for k, v in attrs:
if string.lower(k[0:2]) != 'on' and string.lower(v[0:10]) != 'javascript':
self.result = '%s %s="%s"' % (self.result, k, v)
endTag = '</%s>' % tag
self.endTagList.insert(0,endTag)
self.result = self.result + '>'
def handle_endtag(self, tag):
if tag in self.valid_tags:
self.result = "%s</%s>" % (self.result, tag)
remTag = '</%s>' % tag
self.endTagList.remove(remTag)
def cleanup(self):
""" Append missing closing tags """
for j in range(len(self.endTagList)):
self.result = self.result + self.endTagList[j]
def strip(s):
""" Strip illegal HTML tags from string s """
parser = StrippingParser()
parser.feed(s)
parser.close()
parser.cleanup()
return parser.result
def truncate_html(string, length, ellipsis='...'):
"""Truncate HTML string, preserving tag structure and character entities."""
length = int(length)
output_length = 0
i = 0
pending_close_tags = {}
while output_length < length and i < len(string):
c = string[i]
if c == '<':
# probably some kind of tag
if i in pending_close_tags:
# just pop and skip if it's closing tag we already knew about
i += len(pending_close_tags.pop(i))
else:
# else maybe add tag
i += 1
match = tag_end_re.match(string[i:])
if match:
tag = match.groups()[0]
i += match.end()
# save the end tag for possible later use if there is one
match = re.search(r'(</' + tag + '[^>]*>)', string[i:], re.IGNORECASE)
if match:
pending_close_tags[i + match.start()] = match.groups()[0]
else:
output_length += 1 # some kind of garbage, but count it in
elif c == '&':
# possible character entity, we need to skip it
i += 1
match = entity_end_re.match(string[i:])
if match:
i += match.end()
# this is either a weird character or just '&', both count as 1
output_length += 1
else:
# plain old characters
skip_to = string.find('<', i, i + length)
if skip_to == -1:
skip_to = string.find('&', i, i + length)
if skip_to == -1:
skip_to = i + length
# clamp
delta = min(skip_to - i,
length - output_length,
len(string) - i)
output_length += delta
i += delta
output = [string[:i]]
if output_length == length:
output.append(ellipsis)
for k in sorted(pending_close_tags.keys()):
output.append(pending_close_tags[k])
return "".join(output)
def summarize(contents,length):
summary = strip(contents)
summary = truncate_html(summary,length)
return summary
Calling summarize('my long text with links', 400) currently returns a 400-character summary (excluding HTML tags) that still contains links. I need help replacing those links with <em> tags.
You can use a simple regex for this:
import re
'<em>' + re.search(r'<a[^>]*>(.*?)</a>', r'my long text with links').group(1) + '</em>'
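Since the goal is to replace every link in the summary, not just the first match, re.sub with a backreference is probably closer to what you want; a small sketch (html stands for your summary string):
import re

html = 'see <a href="/a">this page</a> and <a href="/b">that one</a>'
# swap each anchor for an <em> wrapping its link text; (?is) = ignore case, dot matches newlines
result = re.sub(r'(?is)<a[^>]*>(.*?)</a>', r'<em>\1</em>', html)
print(result)  # see <em>this page</em> and <em>that one</em>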
