Parse a txt file and store data into a dictionary - python

I have a set of data that I would like to extract from a txt file and store in a specific format. The data is currently in a txt file like so:
set firewall family inet filter INBOUND term TEST from source-address 1.1.1.1/32
set firewall family inet filter INBOUND term TEST from destination-prefix-list test-list
set firewall family inet filter INBOUND term TEST from protocol udp
set firewall family inet filter INBOUND term TEST from destination-port 53
set firewall family inet filter INBOUND term TEST then accept
set firewall family inet filter PROD term LAN from source-address 4.4.4.4/32
set firewall family inet filter PROD term LAN from source-address 5.5.5.5/32
set firewall family inet filter PROD term LAN from protocol tcp
set firewall family inet filter PROD term LAN from destination-port 443
set firewall family inet filter PROD term LAN then deny
I would like the data to be structured so that each rule has its respective options placed into a dictionary, which is then appended to a list. For example:
Expected Output
[{'Filter': 'INBOUND', 'Term': 'TEST', 'SourceIP': '1.1.1.1/32', 'DestinationList': 'test-list', 'Protocol': 'udp', 'DestinationPort': '53', 'Action': 'accept'},
{'Filter': 'PROD', 'Term': 'LAN', 'SourceIP': ['4.4.4.4/32','5.5.5.5/32'], 'Protocol': 'tcp', 'DestinationPort': '443', 'Action': 'deny'}]
As you can see there may be instances where a certain trait does not exist for a rule. I would also have to add multiple IP addresses as a value. I am currently using Regex to match the items in the txt file. My thought was to iterate through each line in the file, find any matches and add them as a key-value pair to a dictionary.
Once I get an "accept" or "deny", that should signal the end of the rule and I will append the dictionary to the list, clear the dictionary and start the process with the next rule. However this does not seem to be working as intended. My Regex seems fine but I cant seem to figure out the logic when processing each line, adding multiple values to a value list, and adding values to the dictionary. Here is my code below
import re
data_file = "sample_data.txt"

##### REGEX PATTERNS #####
# Every pattern starts with a lookbehind on its keyword, so the match is only
# the value that follows the keyword.
filter_re = r'(?<=filter\s)(.*)(?=\sterm.)'
term_re = r'(?<=term\s)(.*)(?=\sfrom|\sthen)'
protocol_re = r'(?<=protocol\s)(.*)'
dest_port_re = r'(?<=destination-port\s)(.*)'
source_port_re = r'(?<=from\ssource-port\s)(.*)'
prefix_source_re = r'(?<=from\ssource-prefix-list\s)(.*)'
prefix_dest_re = r'(?<=from\sdestination-prefix-list\s)(.*)'
source_addr_re = r'(?<=source-address\s)(.*)'
dest_addr_re = r'(?<=destination-address\s)(.*)'
action_re = r'(?<=then\s)(deny|accept)'

# Patterns and headers are paired positionally with zip(), so both lists must
# have the same length and the same order.
pattern_list = [
    filter_re, term_re, source_addr_re, prefix_source_re, source_port_re,
    dest_addr_re, prefix_dest_re, dest_port_re, protocol_re, action_re,
]
# NOTE: the comma after "Destination_Address" must sit outside the quotes;
# inside them, Python silently concatenates the two adjacent string literals
# into a single header and every later header is paired with the wrong pattern.
pattern_headers = [
    "Filter", "Term", "Source_Address", "Source_Prefix_List", "Source_Port",
    "Destination_Address", "Destination_Prefix_List", "Destination_Port",
    "Protocol", "Action",
]
final_list = []
def open_file(file):
rule_dict = {}
with open(file, 'r') as f:
line = f.readline()
while line:
line = f.readline().strip()
for header, pattern in zip(pattern_headers,pattern_list):
match = re.findall(pattern, line)
if len(match) != 0:
if header != 'accept' or header != 'deny':
rule_dict[header] = match[0]
else:
rule_dict[header] = match[0]
final.append(rule_dict)
rule_dict = {}
print(rule_dict)
print(final_list)
The final list is empty and rule_dict only contains the final rule from the text file, not both of the rulesets. Any guidance would be greatly appreciated.

There are a few small mistakes in your code:
in your while loop f.readline() needs to be at the end, otherwise you already begin in line 2 (readline called twice before doing anything)
final_list has to be defined in your function and then also used
correctly (instead of only "final")
if header != 'accept' or header != 'deny':: here needs to be an and. One of them is always True, so the else part never gets executed.
you need to check the match for accept|deny, not the header
for example in Source_IP you want to have a list with all IP's you find. The way you do it, the value would always be updated and only the last found IP will be in your final_list
def open_file(file):
    """Parse firewall "set" lines from *file* into one dict per rule and print them.

    Each line is tested against every (header, pattern) pair taken from the
    module-level ``pattern_headers`` and ``pattern_list``.  A matched value of
    "accept" or "deny" closes the current rule: the rule is appended to the
    result list and a fresh rule is started.

    A header that collected a single value is stored as that value; a header
    that collected several distinct values is stored as a list in the order
    the values appeared in the file.
    """
    final_list = []
    rule_dict = {}
    with open(file) as f:
        for raw_line in f:
            line = raw_line.strip()
            for header, pattern in zip(pattern_headers, pattern_list):
                match = re.findall(pattern, line)
                if not match:
                    continue
                value = match[0]
                # Keep values in first-seen order and skip repeats; a set
                # would drop repeats too but return them in arbitrary order.
                values = rule_dict.setdefault(header, [])
                if value not in values:
                    values.append(value)
                if value in ("accept", "deny"):
                    # End of rule: unwrap single-element lists to a plain value.
                    final_list.append(
                        {k: (v if len(v) > 1 else v[0]) for k, v in rule_dict.items()}
                    )
                    rule_dict = {}
    print(f"{rule_dict=}")
    print(f"{final_list=}")
open_file(data_file)
Output:
rule_dict={}
final_list=[
{
'Filter': 'INBOUND',
'Term': 'TEST',
'Source_Address': '1.1.1.1/32',
'Destination_Prefix_List': 'test-list',
'Protocol': 'udp', 'Destination_Port': '53',
'Action': 'accept'
},
{
'Filter': 'PROD',
'Term': 'LAN',
'Source_Address': ['5.5.5.5/32', '4.4.4.4/32'],
'Protocol': 'tcp',
'Destination_Port': '443',
'Action': 'deny'
}
]

There are a few things that I have changed in your code:
When "accept" or "deny" is found as the action, append final_dict to final_list and reset final_dict to an empty dictionary
Allow more than one SourceIP: when a second SourceIP is found for a rule, turn the value of SourceIP into a list
import re
data_file = "/home/hiraltalsaniya/Documents/Hiral/test"
filter_re = r'(?<=filter\s)(.*)(?=\sterm.)'
term_re = r'(?<=term\s)(.*)(?=\sfrom|\sthen)'
protocol_re = r'(?<=protocol\s)(.*)'
dest_port_re = r'(?<=destination-port\s)(.*)'
source_port_re = r'(?<=from\ssource-port\s)(.*)'
prefix_source_re = r'(?<=from\ssource-prefix-list\s)(.*)'
prefix_dest_re = r'(?<=from\sdestination-prefix-list\s)(.*)'
source_addr_re = r'(?<=source-address\s)(.*)'
dest_addr_re = r'(?<=destination-address\s)(.*)'
action_re = r'(?<=then\s)(deny|accept)'
pattern_list = [filter_re, term_re, source_addr_re, prefix_source_re, source_port_re, dest_addr_re, prefix_dest_re,
dest_port_re, protocol_re, action_re]
pattern_headers = ["Filter", "Term", "SourceIP", "Source_Prefix_List", "Source_Port", "Destination_Address",
"DestinationList", "Destination_Port", "Protocol", "Action"]
def open_file(file):
    """Parse firewall "set" lines from *file* into one dict per rule and print the list.

    Each line is tested against every (header, pattern) pair taken from the
    module-level ``pattern_headers`` and ``pattern_list``.  An "Action" match
    (accept/deny) marks the end of a rule: the action is recorded, the rule is
    appended to the result list and a new rule is started.

    "SourceIP" is stored as a plain string for one address and as a flat list
    when a rule has several addresses.
    """
    final_dict: dict = {}
    final_list: list = []
    with open(file) as f:
        for line in f:
            for header, pattern in zip(pattern_headers, pattern_list):
                match = re.search(pattern, line)
                if not match:
                    continue
                value = match.group()
                if header == "Action":
                    # Store the action itself before closing the rule,
                    # otherwise the finished rule has no 'Action' key.
                    final_dict[header] = value
                    final_list.append(final_dict)
                    final_dict = {}
                elif header == "SourceIP" and header in final_dict:
                    existing = final_dict[header]
                    if isinstance(existing, list):
                        # Third and later addresses: extend the existing list
                        # instead of wrapping it in another list.
                        existing.append(value)
                    else:
                        final_dict[header] = [existing, value]
                else:
                    final_dict[header] = value
    print("final_list=", final_list)
open_file(data_file)
Output:
final_list= [{'Filter': 'INBOUND',
'Term': 'TEST',
'SourceIP': '1.1.1.1/32',
'DestinationList': 'test-list',
'Protocol': 'udp',
'Destination_Port': '53'
},
{'Filter': 'PROD',
'Term': 'LAN',
'SourceIP': ['4.4.4.4/32', '5.5.5.5/32'],
'Protocol': 'tcp',
'Destination_Port': '443'
}]

Related

Python looping list and append value to variable

I have a .csv file with IPs which I converted into a list with Python:
def ip_list():
    """Return every cell of the CSV file as one flat list, row by row."""
    with open("/path/to/file") as csvfile:
        return [cell for row in csv.reader(csvfile) for cell in row]
ip = ip_list()
print(ip)
>>> ["192.168.1.1", "192.168.1.2", ...]
Now I want to have every value in the list and append them to a given parameter each time.
Function for context:
def gencontent(ip, value1, value2, time):
    """Build the one-element payload list for *ip*.

    value1, value2 and time are accepted but not used in the payload.
    """
    entry = {"example": {"ipadress": ip}}
    return [entry]
ip = ["192.168.1.1", "192.168.1.2", "192.168.1.3"]
content = getcontent(ip[0-...], value1, value2, time)
I want loop content with each value in ip:
#Example list for reproduction
ip = ["192.168.1.1", "192.168.1.2", "192.168.1.3"]
content = getcontent(ip[0-...], ...)
I do not want:
#Example list for reproduction
ip = ["192.168.1.1", "192.168.1.2", "192.168.1.3"]
content1 = getcontent(ip[0], ...)
content2 = getcontent(ip[1], ...)
...
I want to loop content basically each time with a new ip value.
Thanks!
I don't know what the getcontent() function does, but why not loop through the items in your list using a list comprehension?
content = [getcontent(x) for x in ip]
If you simply want to index them, maybe you could convert to a tuple and use enumerate.
For example:
ip = ["192.168.1.1", "192.168.1.2", "192.168.1.3"]
indexed_ip = enumerate(tuple(ip))
print(list(indexed_ip))
# OUTPUT:
# [(0, '192.168.1.1'), (1, '192.168.1.2'), (2, '192.168.1.3')]
Or if you want the index to start at 1, instead of 0:
ip = ["192.168.1.1", "192.168.1.2", "192.168.1.3"]
indexed_ip = enumerate(tuple(ip), 1)
print(list(indexed_ip))
# OUTPUT:
# [(1, '192.168.1.1'), (2, '192.168.1.2'), (3, '192.168.1.3')]
Alternatively, maybe a dictionary work for you in this situation.
Here’s an example using dictionary comprehension:
ip_dict = { ip.index(ip_item): ip_item for ip_item in ip}
print(ip_dict)
# OUTPUT:
# {0: '192.168.1.1', 1: '192.168.1.2', 2: '192.168.1.3'}
You can name the keys of the dictionary whatever you’d like. If you’re set on content0, content1, etc., you could change the key in the dict comprehension to something like f'content{str(ip.index(ip_item))}'. Then you could get a value from ip_dict using ip_dict['content1'], and so on.
can you be more specific about content = getcontent(ip[0-...])?
i don't know whether i get you.
maybe something like this?
ip = ["192.168.1.1", "192.168.1.2", "192.168.1.3"]
def getip(li):
    """Yield the items of *li* one at a time, in order."""
    yield from li
ipgetter = getip(ip)
content = getcontent(next(ipgetter), value1, value2, time) # getcontent got "192.168.1.1"
content = getcontent(next(ipgetter), value1, value2, time) # getcontent got "192.168.1.2"
When the generator is exhausted, a StopIteration exception will be raised.

How can I prefix a value in a python dictionary used for an Ansible dynamic inventory?

I'm working on a custom Ansible dynamic inventory python script. I have created groups from k=v pairs, but for certain groups, I want the key prefixed to the values, otherwise the group names are meaningless (1,2,3, etc.)
I've tried sticking the key name in various places, but without a proper understanding of what I'm doing. In the example below, I am trying to get the "bucket" group to have every value look something like bucket_3 (which would then be the Ansible group name).
result = {
'all': {
'hosts': [],
'vars': {},
},
'_meta': {
'hostvars': {}
}
}
server = ''
for raw_line in output.split('\n'):
line = raw_line.strip()
if len(line) > 0 and not line.startswith(comment_char):
if line.endswith(server_char):
server = line[:-1]
result['all']['hosts'].append(server)
result['_meta']['hostvars'][server] = {}
else:
raw_key, raw_value = line.split('=', 1)
key = raw_key.strip()
value = raw_value.strip()
result['_meta']['hostvars'][server][key] = value
if key == 'ansible_groups':
for group in value.split(","):
if group not in result.keys():
result[group] = {'hosts': [], 'vars': {}}
result[group]['hosts'].append(server)
if key == 'bucket':
for group in value:
if group not in result.keys():
result[group] = 'bucket_' + {'hosts': [], 'vars': {}}
result[group]['hosts'].append(server)
I expect to get groups such as bucket_1, bucket_2, etc. (The source has 'bucket = 1', 'bucket = 2', etc.).
Getting error "'bucket_' + {'hosts': [], 'vars': {}} TypeError: cannot concatenate 'str' and
'dict' objects"
granted, this is just my latest attempt, so errors have been varied as I try to find the correct way to modify the group name.
nevermind...just not thinking.
if key == 'bucket':
for group in value:
group = 'bucket_' + group
if group not in result.keys():
still a bit slower than I would like, but it is functional

Displaying information in a list using regex and .group()

I would like to display a list containing information from some text files using Python.
What I want to display :
[host_name, hardware_ethernet_value, fixed_address_value]
An example of a file (random examples):
host name-random-text.etc {
hardware ethernet 00:02:99:aa:xx:yc;
fixed-address 1.3.0.155;
}
host name-another-again.etc {
hardware ethernet 00:02:99:aa:xx:yc;
fixed-address 3.5.0.115;
}
Someone helped me to write a code for that but it doesn't work anymore, though I know where the problem is coming from.
So the code is as follows :
#!/usr/bin/env python
#import modules
import pprint
import re
#open a file
filedesc = open("DATA/fixed.10.3", "r")
#using regex expressions to get the different informations
SCAN = {
'host' : r"^host (\S+) {",
'hardware' : r"hardware ethernet (\S+);",
'fixed-adress' : r"fixed adress (\S+);"
}
item = []
for key in SCAN:
#print(key)
regex = re.compile(SCAN[key])
#print(regex)
for line in filedesc:
#print(line)
match = regex.search(line)
#print(match)
#match = re.search(regex, line)
#if match is not None:
#print(match.group(1))
if match is not None:
#print(match.group(1))
if match.group(1) == key:
print(line)
item += [match.group(2)]
break
#print the final dictionnaries
pp=print(item)
#make sure to close the file after using it with file.close()
What should be expected :
match.group(1) = host
match.group(2) = name-random-text.etc
But what I have is match.group(1) = name-random-text.etc so match.group(2) = nothing here. This is why the condition match.group(1) == key never works, because match.group(1) never takes the values ['host', 'hardware ethernet', 'fixed-address'].
Your regular expression only has one capture group.
If you want to match two groups, with group 1 equal to the SCAN key, you need to change SCAN like this:
# In every pattern, group 1 captures the keyword (it must equal the dict key)
# and group 2 captures the value.
SCAN = {
    'host': r"^(host) (\S+) {",
    'hardware': r"(hardware) ethernet (\S+);",
    # The sample file spells this keyword "fixed-address", with a hyphen.
    'fixed-address': r"(fixed-address) (\S+);"
}
A very simple, but working, solution:
#!/usr/bin/env python
#import modules
import pprint
import re
#open a file
# Sample input, inlined and split into lines instead of being read from a file
file_lines = """
host name-random-text.etc {
hardware ethernet 00:02:99:aa:xx:yc;
fixed-address 1.3.0.155;
}
host name-another-again.etc {
hardware ethernet 00:02:99:aa:xx:yc;
fixed-address 3.5.0.115;
}
""".split('\n')

# One pattern per field; 'end_item' matches the closing brace of a host block.
SCAN = {
    'host': r"^host (\S+) {",
    'hardware': r"hardware ethernet (\S+);",
    'fixed_address': r"fixed-address (\S+);",
    'end_item': r"^\s*}$\s*",
}
# Compile each pattern once, up front, rather than for every line.
SCAN = {key: re.compile(expr) for key, expr in SCAN.items()}

items = []  # finished host blocks
item = {}   # host block currently being filled
for line in file_lines:
    for key, expr in SCAN.items():
        m = expr.search(line)
        if not m:
            continue
        if key == 'end_item':
            # Closing brace: store the finished block and start a new one.
            items.append(item)
            print("Current item", [item.get(x) for x in ['host', 'hardware', 'fixed_address']])
            item = {}
        else:
            item[key] = m.group(1)
print("Full list of items")
pprint.pprint(items)
Output:
Current item ['name-random-text.etc', '00:02:99:aa:xx:yc', '1.3.0.155']
Current item ['name-another-again.etc', '00:02:99:aa:xx:yc', '3.5.0.115']
Full list of items
[{'fixed_address': '1.3.0.155',
'hardware': '00:02:99:aa:xx:yc',
'host': 'name-random-text.etc'},
{'fixed_address': '3.5.0.115',
'hardware': '00:02:99:aa:xx:yc',
'host': 'name-another-again.etc'}]

How to initialise data structure once in python loop

I am trying to build up a data structure from a CSV file. The CSV file contents are below.
‘Windows 8’,10.1.1.1,’Windows 8 Server’,’SiteA’
‘Windows 8’,10.2.2.2,’Windows 8 Server’,’SiteB’
‘Cisco Router,’172.16.1.1’,’Cisco Router 881’,’SiteA’
‘Cisco Router,’172.16.1.3’,’Cisco Router 881’,’SiteC’
‘Cisco Router,’172.16.1.4’,’Cisco Router 881’,’SiteB’
I am trying to group the data by Device Type, then Site and have a list of common ip addresses along with the description.
The problem I am having is that I cannot work out how to ensure I initialise the various parts of the data structure only once.
Below is my code.
import csv
import pprint
data = {}
pp = pprint.PrettyPrinter(indent=4)
f = open('/Users/marcos/Desktop/vulns/data.csv', 'rt')
try:
reader = csv.reader(f)
for row in reader:
product = row[0]
ip = row[1]
description = row[2]
site = row[3]
try:
data[product][site]['ipaddresses'].append(ip)
data[product][site]['description'] = description
except:
data[product] = {}
data[product][site] = {}
data[product][site]['ipaddresses'] = []
data[product][site]['ipaddresses'].append(ip)
data[product][site]['description'] = description
finally:
f.close()
pp.pprint(data)
What I am currently getting is the following, which is because my except is always triggering I believe
{ '‘Cisco Router': { '’SiteB’': { 'description': '’Cisco Router 881’',
'ipaddresses': ['’172.16.1.4’']}},
'‘Windows 8’': { '’SiteB’': { 'description': '’Windows 8 Server’',
'ipaddresses': ['10.2.2.2']}}}
Raising an exception is useful in showing what is actually wrong. When I did this I saw KeyErrors, so I used this approach:
try:
    reader = csv.reader(f)
    for row in reader:
        product = row[0]
        ip = row[1]
        description = row[2]
        site = row[3]
        try:
            # Create each missing level of the nested structure before it is
            # used, so existing entries are never overwritten.
            if product not in data:
                data[product] = {}
            if site not in data[product]:
                data[product][site] = {}
            if 'ipaddresses' not in data[product][site]:
                data[product][site]['ipaddresses'] = []
            data[product][site]['ipaddresses'].append(ip)
            data[product][site]['description'] = description
        except Exception:
            # Re-raise so the real error (e.g. a KeyError) is visible rather
            # than being silently swallowed.
            raise
finally:
    f.close()
pp.pprint(data)
Notice that I am creating any keys, lists, or dicts that are needed before trying to work with them.
This gives me the following output:
{ 'Cisco Router': { 'SiteA': { 'description': 'Cisco Router 881',
'ipaddresses': ['172.16.1.1']},
'SiteB': { 'description': 'Cisco Router 881',
'ipaddresses': ['172.16.1.4']},
'SiteC': { 'description': 'Cisco Router 881',
'ipaddresses': ['172.16.1.3']}},
'Windows 8': { 'SiteA': { 'description': 'Windows 8 Server',
'ipaddresses': ['10.1.1.1']},
'SiteB': { 'description': 'Windows 8 Server',
'ipaddresses': ['10.2.2.2']}}}
Here is an approach using the .setdefault method. When used in a loop it does exactly what you're asking for: It initialises the value if the key does not exist, otherwise it returns the stored value.
I personally like it but I can see how others don't because it makes nested lookups a bit harder to read. It's a matter of taste:
reader = """
‘Windows 8’,10.1.1.1,’Windows 8 Server’,’SiteA’
‘Windows 8’,10.2.2.2,’Windows 8 Server’,’SiteB’
‘Cisco Router,’172.16.1.1’,’Cisco Router 881’,’SiteA’
‘Cisco Router,’172.16.1.3’,’Cisco Router 881’,’SiteC’
‘Cisco Router,’172.16.1.4’,’Cisco Router 881’,’SiteB’
"""
reader = [line.split(',') for line in reader.replace("'", '').strip().split('\n')]
data = {}
for row in reader:
product, ip, description, site = row[:4]
site_data = data.setdefault(product, {}).setdefault(site, {})
site_data.setdefault('ipaddresses', []).append(ip)
site_data['description'] = description
import pprint
pprint.pprint(data)
Prints:
{'‘Cisco Router': {'’SiteA’': {'description': '’Cisco Router 881’',
'ipaddresses': ['’172.16.1.1’']},
'’SiteB’': {'description': '’Cisco Router 881’',
'ipaddresses': ['’172.16.1.4’']},
'’SiteC’': {'description': '’Cisco Router 881’',
'ipaddresses': ['’172.16.1.3’']}},
'‘Windows 8’': {'’SiteA’': {'description': '’Windows 8 Server’',
'ipaddresses': ['10.1.1.1']},
'’SiteB’': {'description': '’Windows 8 Server’',
'ipaddresses': ['10.2.2.2']}}}
This seems like a useful time to use pandas.
import pandas as pd

# The CSV shown in the question has no header row, so name the columns
# explicitly; with the default header handling the first data row would be
# consumed as column names and lost.
data_ = pd.read_csv(
    'path-to-data.csv',
    header=None,
    names=['product', 'ip', 'description', 'site'],
)
# Create a 'grouped' dataset object
grouped = data_.groupby(['product', 'site', 'ip'])
# Create a dataset with a tuple of the 'description' values,
# grouped by columns above
unique_desc_by_group = grouped['description'].aggregate(lambda x: tuple(x))
print(unique_desc_by_group)

Converting part of string into variable name in python

I have a file containing a text like this:
loadbalancer {
upstream application1 {
server 127.0.0.1:8082;
server 127.0.0.1:8083;
server 127.0.0.1:8084;
}
upstream application2 {
server 127.0.0.1:8092;
server 127.0.0.1:8093;
server 127.0.0.1:8094;
}
}
Does anyone know, how could I extract variables like below:
appList=["application1","application2"]
ServerOfapp1=["127.0.0.1:8082","127.0.0.1:8083","127.0.0.1:8084"]
ServerOfapp2=["127.0.0.1:8092","127.0.0.1:8093","127.0.0.1:8094"]
.
.
.
and so on
If the lines you want always start with upstream and server this should work:
app_dic = {}
with open('file.txt','r') as f:
for line in f:
if line.startswith('upstream'):
app_i = line.split()[1]
server_of_app_i = []
for line in f:
if not line.startswith('server'):
break
server_of_app_i.append(line.split()[1][:-1])
app_dic[app_i] = server_of_app_i
app_dic should then be a dictionary of lists:
{'application1': ['127.0.0.1:8082', '127.0.0.1:8083', '127.0.0.1:8084'],
'application2': ['127.0.0.1:8092', '127.0.0.1:8093', '127.0.0.1:8094']}
EDIT
If the input file does not contain any newline character, as long as the file is not too large you could write it to a list and iterate over it:
app_dic = {}
with open('file.txt','r') as f:
txt_iter = iter(f.read().split()) #iterator of list
for word in txt_iter:
if word == 'upstream':
app_i = next(txt_iter)
server_of_app_i=[]
for word in txt_iter:
if word == 'server':
server_of_app_i.append(next(txt_iter)[:-1])
elif word == '}':
break
app_dic[app_i] = server_of_app_i
This is more ugly as one has to search for the closing curly bracket to break. If it gets any more complicated, regex should be used.
If you are able to use the newer regex module by Matthew Barnett, you can use the following solution, see an additional demo on regex101.com:
import regex as re
rx = re.compile(r"""
(?:(?P<application>application\d)\s{\n| # "application" + digit + { + newline
(?!\A)\G\n) # assert that the next match starts here
server\s # match "server"
(?P<server>[\d.:]+); # followed by digits, . and :
""", re.VERBOSE)
string = """
loadbalancer {
upstream application1 {
server 127.0.0.1:8082;
server 127.0.0.1:8083;
server 127.0.0.1:8084;
}
upstream application2 {
server 127.0.0.1:8092;
server 127.0.0.1:8093;
server 127.0.0.1:8094;
}
}
"""
result = {}
for match in rx.finditer(string):
if match.group('application'):
current = match.group('application')
result[current] = list()
if current:
result[current].append(match.group('server'))
print result
# {'application2': ['127.0.0.1:8092', '127.0.0.1:8093', '127.0.0.1:8094'], 'application1': ['127.0.0.1:8082', '127.0.0.1:8083', '127.0.0.1:8084']}
This makes use of the \G modifier, named capture groups and some programming logic.
This is the basic method:
import re

# each of your objects here
objText = "xyz xcyz 244.233.233.2:123"
# IPv4 address (each octet 0-255) followed by ":" and a 1-5 digit port.
# Python patterns take no /.../g delimiters: findall already returns every
# match, and literal slashes in the pattern would have to appear in the text.
ip_port_re = (
    r"\b(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}"
    r"(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?):[0-9]{1,5}"
)
listOfAll = re.findall(ip_port_re, objText)
for eachMatch in listOfAll:
    # The format string needs a %s placeholder for the match to be inserted.
    print("Here's one! %s" % eachMatch)
Obviously that's a bit rough around the edges, but it will perform a full-scale regex search of whatever string it's given. Probably a better solution would be to pass it the objects themselves, but for now I'm not sure what you would have as raw input. I'll try to improve on the regex, though.
I believe this as well can be solved with re:
>>> import re
>>> from collections import defaultdict
>>>
>>> APP = r'\b(?P<APP>application\d+)\b'
>>> IP = r'server\s+(?P<IP>[\d\.:]+);'
>>>
>>> pat = re.compile('|'.join([APP, IP]))
>>>
>>>
>>> scan = pat.scanner(s)
>>> d = defaultdict(list)
>>>
>>> for m in iter(scan.search, None):
group = m.lastgroup
if group == 'APP':
keygroup = m.group(group)
continue
else:
d[keygroup].append(m.group(group))
>>> d
defaultdict(<class 'list'>, {'application1': ['127.0.0.1:8082', '127.0.0.1:8083', '127.0.0.1:8084'], 'application2': ['127.0.0.1:8092', '127.0.0.1:8093', '127.0.0.1:8094']})
Or similarly with re.finditer method and without pat.scanner:
>>> for m in re.finditer(pat, s):
group = m.lastgroup
if group == 'APP':
keygroup = m.group(group)
continue
else:
d[keygroup].append(m.group(group))
>>> d
defaultdict(<class 'list'>, {'application1': ['127.0.0.1:8082', '127.0.0.1:8083', '127.0.0.1:8084'], 'application2': ['127.0.0.1:8092', '127.0.0.1:8093', '127.0.0.1:8094']})

Categories