Reverse regular expression in Python - python

this is a strange question I know... I have a regular expression like:
rex = r"at (?P<hour>[0-2][0-9]) send email to (?P<name>\w*):? (?P<message>.+)"
so if I match that like this:
match = re.match(rex, "at 10 send email to bob: hi bob!")
match.groupdict() gives me this dict:
{"hour": "10", "name": "bob", "message": "hi bob!"}
My question is: given the dict above and rex, can I make a function that returns the original text? I know that many texts can match to the same dict (in this case the ':' after the name is optional) but I want one of the infinite texts that will match to the dict in input.

Using inverse_regex:
import itertools as IT
import sre_constants as sc
import sre_parse
import string
# Generate strings that match a given regex
# Maps sre category constants to the characters this generator will emit for
# them. Only ASCII characters are produced, which is narrower than what the
# regex engine itself would accept for \d, \s and \w.
category_chars = {
    sc.CATEGORY_DIGIT: string.digits,
    sc.CATEGORY_SPACE: string.whitespace,
    # string.letters exists only on Python 2; ascii_letters exists on both.
    sc.CATEGORY_WORD: string.digits + string.ascii_letters + '_',
}
def unique_extend(res_list, list):
    """Append to ``res_list`` every item of ``list`` not already in it.

    ``res_list`` is modified in place and the relative order of first
    occurrences is preserved. Nothing is returned.

    The second parameter keeps its original name (which shadows the builtin)
    so that existing keyword callers continue to work.
    """
    for item in list:
        if item not in res_list:
            res_list.append(item)
def handle_any(val):
    """Return the candidate characters for the ``.`` (ANY) token.

    This is different from normal regexp matching. It only matches
    printable ASCII characters.

    ``val`` is ignored; it is accepted only so that every handler shares
    the same one-argument signature.
    """
    return string.printable
def handle_branch(tok_val):
    """Return the de-duplicated strings produced by all alternatives of a branch.

    ``tok_val`` is the two-item value of a BRANCH token; its second item is
    the sequence of alternative token lists. The first item is unused.

    The argument is unpacked in the body because tuple parameters in a
    function signature are a syntax error on Python 3.
    """
    _, alternatives = tok_val
    all_opts = []
    for toks in alternatives:
        for candidate in permute_toks(toks):
            # Keep the first occurrence only, preserving generation order.
            if candidate not in all_opts:
                all_opts.append(candidate)
    return all_opts
def handle_category(val):
    """Return the characters of category ``val`` as a list of 1-char strings.

    Raises KeyError when ``val`` is not a key of ``category_chars``.
    """
    return list(category_chars[val])
def handle_in(val):
    """Return every option generated by the members of a character set.

    ``val`` is an iterable of ``(token, value)`` pairs; each pair is expanded
    with ``handle_tok`` and the results are concatenated in order.
    """
    out = []
    # Distinct loop names: the original re-bound ``val`` while iterating it.
    for member_tok, member_val in val:
        out.extend(handle_tok(member_tok, member_val))
    return out
def handle_literal(val):
    """Return a one-element list holding the character with code point ``val``."""
    return [chr(val)]
def handle_max_repeat(args):
    """Handle a repeat token such as {x,y} or ?.

    ``args`` is the ``(min, max, subpattern)`` value of a MAX_REPEAT token.
    Only the first token of ``subpattern`` is repeated. Returns a generator
    of strings, ordered by increasing repetition count.

    The argument is unpacked in the body because tuple parameters in a
    function signature are a syntax error on Python 3.
    """
    lo, hi, val = args
    subtok, subval = val[0]
    if hi > 5000:
        # hi is the number of cartesian join operations needed to be
        # carried out. More than 5000 consumes way too much memory, so the
        # count is clamped instead of raising ValueError.
        hi = 5000
    # Materialise once: a handler may return a one-shot generator, which
    # would be exhausted after the first repetition count.
    optlist = list(handle_tok(subtok, subval))
    # Build each product lazily rather than creating all of them up front.
    products = (IT.product(optlist, repeat=count)
                for count in range(lo, hi + 1))
    return (''.join(it) for it in IT.chain.from_iterable(products))
def handle_range(val):
    """Yield each character whose code point lies in the inclusive range ``val``.

    ``val`` is a ``(lo, hi)`` pair of integer code points. The result is a
    generator and can therefore be consumed only once.
    """
    lo, hi = val
    return (chr(x) for x in range(lo, hi + 1))
def handle_subpattern(val):
    """Return the list of strings generated by a group's inner pattern.

    The inner token list is taken from the LAST item of ``val``. Older
    parsers yield ``(group, pattern)`` while Python 3.6+ yields
    ``(group, add_flags, del_flags, pattern)``, so ``val[1]`` is only correct
    for the older layout.
    """
    return list(permute_toks(val[-1]))
def handle_tok(tok, val):
    """Return the possible strings for a single parsed regexp token.

    Returns a list (or other iterable) of strings of possible permutations
    for this regexp token, as produced by the handler registered for ``tok``.

    Raises ValueError when ``tok`` has no registered handler. A KeyError
    raised inside a handler is reported the same way, because the call sits
    inside the ``try`` block.
    """
    handlers = {
        sc.ANY: handle_any,
        sc.BRANCH: handle_branch,
        sc.CATEGORY: handle_category,
        sc.LITERAL: handle_literal,
        sc.IN: handle_in,
        sc.MAX_REPEAT: handle_max_repeat,
        sc.RANGE: handle_range,
        sc.SUBPATTERN: handle_subpattern,
    }
    try:
        return handlers[tok](val)
    except KeyError:
        fmt = "Unsupported regular expression construct: %s"
        raise ValueError(fmt % tok)
def permute_toks(toks):
    """Return a generator of strings of possible permutations for this
    regexp token list.

    ``toks`` is an iterable of ``(token, value)`` pairs. Each pair is
    expanded with ``handle_tok`` and the cartesian product of the expansions
    is joined into strings.
    """
    lists = [handle_tok(tok, val) for tok, val in toks]
    return (''.join(it) for it in IT.product(*lists))
########## PUBLIC API ####################
def ipermute(p):
    """Return a generator of strings matching the regular expression ``p``.

    ``p`` is parsed with ``sre_parse.parse`` and the resulting token list is
    expanded by ``permute_toks``.
    """
    return permute_toks(sre_parse.parse(p))
You could apply the substitutions given rex and data, and then use inverse_regex.ipermute to generate strings that match the original regex:
import re
import itertools as IT
import inverse_regex as ire
rex = r"(?:at (?P<hour>[0-2][0-9])|today) send email to (?P<name>\w*):? (?P<message>.+)"
match = re.match(rex, "at 10 send email to bob: hi bob!")
data = match.groupdict()
del match
# Replace every named group "(?P<name>...)" in the pattern with the value
# captured for that name, leaving the rest of the regex untouched.
new_regex = re.sub(r'[(][?]P<([^>]+)>[^)]*[)]', lambda m: data.get(m.group(1)), rex)
# Show the first ten strings generated from the substituted pattern.
for s in IT.islice(ire.ipermute(new_regex), 10):
    print(s)
today send email to bob hi bob!
today send email to bob: hi bob!
at 10 send email to bob hi bob!
at 10 send email to bob: hi bob!
Note: I modified the original inverse_regex to not raise a ValueError when the regex contains *s. Instead, the * is changed to be effectively like {,5000} so you'll at least get some permutations.

This is one of the texts that will match the regex:
'at {hour} send email to {name}: {message}'.format(**match.groupdict())


Building an Abbreviations Dictionary from a Text File

I would like to build a dictionary of abbreviations.
I have a text file with a lot of abbreviations. The text file looks like this (after import):
with open('abreviations.txt') as ab:
ab_words =
An extract:
'Access Control Entry',
'A Completely Obsessive Really Nutty person',
Now I want to build the dictionary, where every odd-numbered line is a dictionary key and every even-numbered line is the corresponding dictionary value.
Hence I should be able to write at the end:
and get the result:
'Access Control Entry'
Also, how can I make it case-insensitive?
should yield the same result
'Access Control Entry'
In fact, it would be perfect, if the output would also be lower case:
'access control entry'
Here is a link to the text file:
Complete solution with custom ABDict class and Python's generator functionality:
class ABDict(dict):
    """Dictionary of abbreviations with case-aware subscript lookup.

    Only ``d[key]`` is customised. Keys are looked up in upper case, so they
    must have been stored in upper case. Other dict methods are unchanged.
    """

    def __getitem__(self, key):
        """Return the expansion for ``key``.

        The stored value is lower-cased when ``key`` is entirely lower case;
        otherwise it is returned as stored. Raises KeyError when
        ``key.upper()`` is not present.
        """
        v = dict.__getitem__(self, key.upper())
        return v.lower() if key.islower() else v
with open('abbreviations.txt') as ab:
    ab_dict = ABDict()
    # Consume the file two lines at a time: a key line, then its value line.
    while True:
        try:
            k = next(ab).strip()  # `key` line
            v = next(ab).strip()  # `value` line
            ab_dict[k] = v
        except StopIteration:
            # End of file. A trailing key without a value line is dropped.
            break
Now, testing (with case-relative access):
print('*' * 10)
The output(consecutively):
Access Control Entry
access control entry
Wish You The Best
wish you the best
Here's another solution based on the pairwise function from this solution:
from requests.structures import CaseInsensitiveDict
def pairwise(iterable):
    """s -> (s0, s1), (s2, s3), (s4, s5), ...

    Groups consecutive items into non-overlapping pairs. A trailing item
    without a partner is dropped.
    """
    a = iter(iterable)
    # Zipping one iterator with itself advances it twice per output pair.
    return zip(a, a)
with open('abreviations.txt') as reader:
    abr_dict = CaseInsensitiveDict()
    # Each pair of lines is (abbreviation, expansion); strip the newlines.
    for abr, full in pairwise(reader):
        abr_dict[abr.strip()] = full.strip()
Here is an answer that also allows sentences to be replaced with words from the dictionary:
import re
from requests.structures import CaseInsensitiveDict
def read_file_dict(filename):
Reads file data into CaseInsensitiveDict
# lists for keys and values
keys = []
values = []
# case sensitive dict
data = CaseInsensitiveDict()
# count used for deciding which line we're on
count = 1
with open(filename) as file:
temp =
for line in temp:
# if the line count is even, a value is being read
if count % 2 == 0:
# otherwise, a key is being read
count += 1
# Add to dictionary
# perhaps some error checking here would be good
for key, value in zip(keys, values):
data[key] = value
return data
def replace_word(ab_dict, sentence):
Replaces sentence with words found in dictionary
# not necessarily words, but you get the idea
words = re.findall(r"[\w']+|[.,!?; ]", sentence)
new_words = []
for word in words:
# if word is in dictionary, replace it and add it to resulting list
if word in ab_dict:
# otherwise add it as normally
# return sentence with replaced words
return "".join(x for x in new_words)
def main():
ab_dict = read_file_dict("abreviations.txt")
print(replace_word(ab_dict, "The ACE is not easy to understand"))
if __name__ == '__main__':
Which outputs:
{'ACE': 'Access Control Entry', 'ACK': 'Acknowledgement', 'ACORN': 'A Completely Obsessive Really Nutty person'}
Access Control Entry
Access Control Entry
Access Control Entry
The Access Control Entry is not easy to understand

Optimizing selenium code

So I wrote some code to grab data about classes at a college to build an interactive scheduler. Here is the code I have to get data:
from selenium import webdriver
import os
import pwd
import shlex
import re
import time
usr = pwd.getpwuid(os.getuid()).pw_name
Path = ('/Users/%s/Downloads/chromedriver') %usr # Have chromedriver dowloaded
# Create a new instance of the Chrome driver
options = webdriver.ChromeOptions()
options.binary_location = '/Applications/Google Chrome'
options.add_argument('headless') # Headless so no window is opened
driver = webdriver.Chrome(Path, chrome_options=options)
driver.get('') # Go to database
classes = {}
def Database(AllSelectedCourseInfo):
    """Build one class record from ``name=value`` tokens and register it.

    Each token is split on ``=``; the first part is the field name and the
    second the value. Tokens without ``=`` are skipped rather than raising
    IndexError. Fields with an empty value are not stored.

    The finished record is stored in the module-level ``classes`` dict under
    its ``"Section"`` field. Raises KeyError when no ``Section`` field was
    found.
    """
    ClassDict = {}
    for item in AllSelectedCourseInfo:  # Go through list of class info
        thing = item.split("=")  # Split string by = to get subject name and value
        if len(thing) < 2:
            continue  # No "=" in this token, so there is no value to read
        name, value = thing[0], thing[1]
        if any(char.isdigit() for char in value):
            # Get rid of annoying Z at the end of numbers (removes every "Z")
            value = value.replace("Z", "")
        if value:  # If subject has a value, store it
            ClassDict[str(name)] = str(value)
    classes[str(ClassDict["Section"])] = ClassDict  # Add to dictionary
def makeDatabase(section):
if "Title" in driver.find_element_by_xpath("//*[text()='%s']"%section).find_element_by_xpath("..").text:
classSection = driver.find_elements_by_xpath("//*[text()='%s']"%section) # If class name given find class
for i in range(0, len(classSection)):
AllSelectedCourseInfo = shlex.split(classSection[i].find_element_by_xpath(".." + "/.."*4).text.replace("/>", "").replace(">", "")) # sort into a list grouping string in quotes and getting rid of unnecessary symbols
classSection = driver.find_element_by_xpath("//*[text()='%s']"%section) # If class section give, find class
AllSelectedCourseInfo = shlex.split(classSection.find_element_by_xpath(".." + "/.."*3).text.replace("/>", "").replace(">", "")) # sort into a list grouping string in quotes and getting rid of unnecessary symbols
def printDic():
    """Print every record in the module-level ``classes`` dict.

    Each record is introduced by a dashed header containing its key and
    followed by one ``field : value`` line per entry. The single-argument
    ``print(...)`` form behaves the same on Python 2 and Python 3.
    """
    for key, record in classes.items():
        print("\n-------------%s------------" % key)
        for classkey, value in record.items():
            print("%s : %s" % (classkey, value))
start = time.time()
makeDatabase("Differential Calculus")
makeDatabase("MA 124B")
end = time.time()
print end - start
It takes about 20 seconds for me to pull data from one class and one class section. To make this practical it will need at least 7 classes, and that would take over a minute just to create the dictionaries. Does anyone know of a way to make this run any faster?
I tried to integrate lxml and requests into my code, but it just didn't have what I was looking for. After a few days of trying to use lxml to accomplish this to no avail, I decided to try beautifulsoup4 with urllib. This worked better than I could have hoped:
from bs4 import BeautifulSoup
from HTMLParser import HTMLParser
import urllib
import shlex
import re
import time
h = HTMLParser()
page = urllib.urlopen('').read() # Get to database
soup = BeautifulSoup(page)
RawClassData = soup.contents[10].contents[0].contents[0].contents
classes = {}
backupClasses = {}
def makeDatabase():
    """Parse every entry of ``RawClassData`` into the ``classes`` dict.

    Each entry is converted to text, has ``>`` replaced by a space, is
    HTML-unescaped and then split shell-style into ``name=value`` tokens.
    Tokens without ``=`` are skipped rather than raising IndexError, and
    fields with an empty value are not stored.

    Each record is stored in ``classes`` under its ``"section"`` field.
    Raises KeyError when an entry has no ``section`` field.
    """
    for raw in RawClassData:  # Parse through each class
        # sort into a list grouping string in quotes and getting rid of unnecessary symbols
        AllSelectedCourseInfo = shlex.split(h.unescape(str(raw).replace(">", " ")))
        ClassDict = {}
        for item in AllSelectedCourseInfo:  # Go through list of class info
            thing = item.split("=")  # Split string by = to get subject name and value
            if len(thing) < 2:
                continue  # No "=" in this token, so there is no value to read
            name, value = thing[0], thing[1]
            if any(char.isdigit() for char in value):
                # Get rid of annoying Z at the end of numbers (removes every "Z")
                value = value.replace("Z", "")
            if value:  # If subject has a value, store it
                ClassDict[str(name)] = str(value)
        classes[str(ClassDict["section"])] = ClassDict
def printDic():
    """Write every record of the ``classes`` dict to the file ``Classes``.

    The file is overwritten. Each record starts with a dashed header
    containing its key, followed by one ``field : value`` line per entry.
    """
    with open("Classes", "w") as f:
        for key, record in classes.items():
            f.write("\n-------------%s------------" % key)
            for classkey, value in record.items():
                f.write("\n%s : %s" % (classkey, value))
def printSection(selection):
    """Print the record stored in ``classes`` under the key ``selection``.

    Raises KeyError when ``selection`` is not a key of ``classes``. The
    single-argument ``print(...)`` form behaves the same on Python 2 and 3.
    """
    print("\n-------------%s------------" % selection)
    for classkey, value in classes[selection].items():
        print("%s : %s" % (classkey, value))
def printClass(selection):
for key in classes:
if classes[key]["title"] == selection:
print "\n-------------%s------------" %key
for classkey in classes[key]:
print "%s : %s" %(classkey, classes[key][classkey])
print "\n-------------%s------------" %selection
for classkey in classes[selection]:
print "%s : %s" %(classkey, classes[selection][classkey])
start = time.time()
end = time.time()
printClass("Circuits and Systems")
printClass("Differential Equations")
printClass("Writing & Communications Collqm")
printClass("Mechanics of Solids")
printClass("Electricity & Magnetism")
printClass("Engineering Design III")
printClass("Freshman Quiz")
print end - start
This new code creates a library of all classes and then prints out the desired class, all in 2 seconds. The selenium code took 89 seconds just to build the library for the desired classes and print them out, so I would say that's a slight improvement... Thanks a ton to perfect5th for the suggestion!

Converting part of string into variable name in python

I have a file containing a text like this:
loadbalancer {
upstream application1 {
upstream application2 {
Does anyone know, how could I extract variables like below:
and so on
If the lines you want always start with upstream and server this should work:
app_dic = {}
with open('file.txt','r') as f:
for line in f:
if line.startswith('upstream'):
app_i = line.split()[1]
server_of_app_i = []
for line in f:
if not line.startswith('server'):
app_dic[app_i] = server_of_app_i
app_dic should then be a dictionary of lists:
{'application1': ['', '', ''],
'application2': ['', '', '']}
If the input file does not contain any newline character, as long as the file is not too large you could write it to a list and iterate over it:
app_dic = {}
with open('file.txt','r') as f:
txt_iter = iter( #iterator of list
for word in txt_iter:
if word == 'upstream':
app_i = next(txt_iter)
for word in txt_iter:
if word == 'server':
elif word == '}':
app_dic[app_i] = server_of_app_i
This is uglier, as one has to search for the closing curly bracket in order to break out of the inner loop. If it gets any more complicated, a regex should be used.
If you are able to use the newer regex module by Matthew Barnett, you can use the following solution, see an additional demo on
import regex as re
rx = re.compile(r"""
(?:(?P<application>application\d)\s{\n| # "application" + digit + { + newline
(?!\A)\G\n) # assert that the next match starts here
server\s # match "server"
(?P<server>[\d.:]+); # followed by digits, . and :
""", re.VERBOSE)
string = """
loadbalancer {
upstream application1 {
upstream application2 {
result = {}
for match in rx.finditer(string):
current ='application')
result[current] = list()
if current:
print result
# {'application2': ['', '', ''], 'application1': ['', '', '']}
This makes use of the \G modifier, named capture groups and some programming logic.
This is the basic method:
# Matches a dotted-quad IPv4 address followed by ":" and a 1-5 digit port.
# Python patterns take no "/.../g" delimiters; with them the pattern would
# require a literal "/" before and "/g" after every address.
IP_PORT_RE = re.compile(
    r"\b(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}"
    r"(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?):[0-9]{1,5}"
)

# each of your objects here
objText = "xyz xcyz"
listOfAll = IP_PORT_RE.findall(objText)
for eachMatch in listOfAll:
    # The format string needs a placeholder; without one "%" raises TypeError.
    print("Here's one! %s" % eachMatch)
Obviously that's a bit rough around the edges, but it will perform a full-scale regex search of whatever string it's given. Probably a better solution would be to pass it the objects themselves, but for now I'm not sure what you would have as raw input. I'll try to improve on the regex, though.
I believe this as well can be solved with re:
>>> import re
>>> from collections import defaultdict
>>> APP = r'\b(?P<APP>application\d+)\b'
>>> IP = r'server\s+(?P<IP>[\d\.:]+);'
>>> pat = re.compile('|'.join([APP, IP]))
>>> scan = pat.scanner(s)
>>> d = defaultdict(list)
>>> for m in iter(, None):
group = m.lastgroup
if group == 'APP':
keygroup =
>>> d
defaultdict(<class 'list'>, {'application1': ['', '', ''], 'application2': ['', '', '']})
Or similarly with re.finditer method and without pat.scanner:
>>> for m in re.finditer(pat, s):
group = m.lastgroup
if group == 'APP':
keygroup =
>>> d
defaultdict(<class 'list'>, {'application1': ['', '', ''], 'application2': ['', '', '']})

Printing values from dictionary in specific form

I have a dictionary with keys relating to various reactions and their data ie. exponentn, comment etc. I want to search and print a list of reactions concerning the atom 'BR'. My code currently prints all reactions for 'BR' and the data in random order. I am not sure which data corresponds to which reaction.
I've had a go at trying to use the repr function to output the data as follows but I'm not having much luck: reactionName : exponentn comment I found another question which I tried to replicate but was not able to do so; printing values and keys from a dictionary in a specific format (python).
class SourceNotDefinedException(Exception):
    """Raised when a source type is not one of the recognised values."""

    def __init__(self, message):
        """Store ``message`` as the exception text."""
        super(SourceNotDefinedException, self).__init__(message)
class tvorechoObject(object):
    """Store a pair of values, one for "tv" and one for "echo".

    They are read with ``.tv`` and ``.echo``. When the requested one is
    None, the other is returned instead. When both are None, None is
    returned.
    """

    def __init__(self, echo=None, tv=None):
        """Store the two values; either may be left as None."""
        self.tv = tv
        self.echo = echo

    def __repr__(self):
        """Return the string form of a dict of both values (after fallback)."""
        return str({"echo": self.echo, "tv": self.tv})

    def __getattribute__(self, item):
        """Return attribute ``item``, with fallback between .echo and .tv.

        ``object.__getattribute__`` is used for the raw reads so that this
        method does not call itself recursively.
        """
        if item in ("echo", "tv"):
            raw = object.__getattribute__
            if raw(self, "echo") is None:  # Echo data not present
                return raw(self, "tv")  # Select TV data
            if raw(self, "tv") is None:  # TV data not present
                return raw(self, "echo")  # Select Echo data
        # Both values present, or an unrelated attribute: return it as stored.
        return object.__getattribute__(self, item)
class Reaction(object):
def __init__(self, inputLine, sourceType=None):
#self.reactionName = QVTorQPObject()
self.exponentn = QVTorQPObject()
self.comment = QVTorQPObject()
self.readIn(inputLine, sourceType=sourceType)
products, reactants = self.reactionName.split(">")
self.products = [product.strip() for product in products.split("+")]
self.reactants = [reactant.strip() for reactant in reactants.split("+")]
def readIn(self, inputLine, sourceType=None):
if sourceType == "echo": # Parsed reaction line for combined format
echoPart = inputLine.split("|")[0]
reactionName = inputLine.split(":")[0].strip()
exponentn = echoPart.split("[")[1].split("]")[0].strip() # inputLine.split("[")[1].split("]")[0].strip()
comment = "%".join(echoPart.split("%")[1:]).strip() # "%".join(inputLine.split("%")[1:]).strip()
# Store the objects
self.reactionName = reactionName
self.exponentn.echo = exponentn
self.comment.echo = comment
elif sourceType == "tv": # Parsed reaction line for combined format
tvPart = inputLine.split("|")[1]
reactionName = inputLine.split(":")[0].strip()
comment = "%".join(tvPart.split("!")[1:]).strip() # "%".join(inputLine.split("!")[1:]).strip()
# Store the objects
self.reactionName = reactionName = comment
elif sourceType.lower() == "unified":
reaction = inputLine.split(":")[0]
echoInput, tvInput = ":".join(inputLine.split(":")[1:]).split("|")
echoInput = reaction + ":" + echoInput
tvInput = reaction + ":" + tvInput
if "Not present in TV" not in tvInput:
self.readIn(inputLine, sourceType="tv")
if "Not present in Echo" not in echoInput:
self.readIn(inputLine, sourceType="echo")
raise SourceNotDefinedException("'%s' is not a valid 'sourceType'" % sourceType) # Otherwise print
def __repr__(self):
return str({"reactionName": self.reactionName, "exponentn": self.exponentn, "comment": self.comment, })
return str(self.reactionName) # Returns all relevant reactions
keykeyDict = {}
for key in reactionDict.keys():
keykeyDict[key] = key
formatString = "{reactionName:<40s} {comment:<10s}" # TV format
formatString = "{reactionName:<40s} {exponentn:<10s} {comment:<10s}" # Echo format
return formatString.format(**keykeyDict)
return formatString.format(**reactionDict)
def toDict(self, priority="tv"):
"""Returns a dictionary of all the variables, in the form {"comment":<>, "exponentn":<>, ...}. Design used is to be passed into the echo and tv style line format statements."""
if priority in ["echo", "tv" # Creating the dictionary by a large, horrible, list comprehension, to avoid even more repeated text
return dict([("reactionName", self.reactionName)] + [(attributeName, self.__getattribute__(attributeName).__getattribute__(priority))
for attributeName in ["exponentn", "comment"]])
raise SourceNotDefinedException("{0} source type not recognised.".format(priority)) # Otherwise print
def find_allReactions(allReactions, reactant_set):
reactant_set is the set of reactants that you want to grab all reactions which are relevant allReactions is just the set of reactions you're considering. Need to repeatedly loop through all reactions. If the current reaction only contains reactants in the reactant_set, then add all its products to the reactant set. Repeat this until reactant_set does not get larger.
reactant_set = set(reactant_set) # this means that we can pass a list, but it will always be treated as a set.
#Initialise the list of reactions that we'll eventually return
relevant_reactions = []
previous_reactant_count = None
while len(reactant_set) != previous_reactant_count:
previous_reactant_count = len(reactant_set)
for reaction in allReactions:
if set(reaction.reactants).issubset(reactant_set):
reactant_set = reactant_set.union(set(reaction.products))
return relevant_reactions
print find_allReactions(allReactions, ["BR"])
Current output:
'{'exponentn': {'tv': '0', 'echo': '0'}, 'comment': {'tv': 'BR-NOT USED', 'echo': 'BR-NOT USED'},'reactionName': 'E + BR > BR* + E', {'exponentn': {'qvt': '0', 'qp': '0'}, 'comment': {'qvt': 'BR+ -RECOMBINATION', 'qp': 'BR+ -RECOMBINATION'},'reactionName': 'E + BR* > BR* + E'
Desired output: reactionName exponentn comment
E + BR* > BR* + E 0 BR-NOT USED
If your data is added into the dict in a certain order, and you want to preserve that order, collections.OrderedDict is what you're looking for.

Python / json : Check content of keys that may or may not exist

For those familiar with imageboards, an OP post may or may not contain a 'subject' and a 'comment'
I wrote this to search all pages of a given board for thread subjects and OP posts.
If my search term exists in one of them but the other key is nonexistent, the post will not get appended to my res list.
So how do I search json keys where 1 key or the other may not exist?
import urllib, json, HTMLParser
def s4Chan(board, search):
logo = '3::54chan'
res = []
p = HTMLParser.HTMLParser()
catalog = json.load(urllib.urlopen('' % board))
for i in catalog:
for j in i['threads']:
if search.lower() in j['sub'].lower() or search.lower() in j['com'].lower():
subject = j['sub']
post = p.unescape(str(j['com'])).replace('<br>', ' ')
if len(post) > 300:
post = post[0:300]
post = post + '...'
text = str('%s /%s/ %s | %s | %s (R:%s, I:%s)' % (logo, board, subject, post, '' % (board, j['no']), j['replies'], j['images']))
return res
json.load returns objects as Python dictionaries. You can, for example, use the get method of dict:
if search.lower() in j.get('sub', '').lower() or search.lower() in j.get('com', '').lower():
