I have a text file with entries that look like this :
JohnDoe
Assignment 9
Reading: NO
header: NO
HW: NO
Solutions: 0
show: NO
Journals: NO
free: NO
Finished: NO
Quiz: 0
Done
Assignment 3
E-book: NO
HW: NO
Readings: NO
Show: 0
Journal: NO
Study: NO
Test: NO
Finished: NO
Quiz: 0
Done
This is a small sample. The file has several students in it. Each student has two assignments under their name and they only pass if the line that starts with "Finished" in each assignment reads "Finished: YES". All of the data under each assignment is disorganized, but somewhere under each assignment a line will say "Finished: YES (or NO)" I need a way to read the file and say whether or not any of the students have passed. So far, I have
def get_entries(file="dicrete.txt.rtf"):
    """Yield a ``(key, value)`` pair for every "Finished" line of a file.

    Args:
        file: Path of the text file to scan. Defaults to the path that
            used to be hard-coded inside the function.

    Yields:
        Tuples such as ``("Finished", "NO")``. Both parts are stripped, so
        the value carries no leading blank or trailing newline.
    """
    with open(file, 'rt') as handle:
        for line in handle:
            # Only lines that start with the marker count; split on the
            # first colon only so a value containing ':' cannot break
            # the unpacking.
            if line.lstrip().startswith("Finished"):
                finished, _, answer = line.partition(':')
                yield finished.strip(), answer.strip()


if __name__ == "__main__":
    # A dict keeps one value per key, so dict(...) would collapse every
    # "Finished" entry into a single one; a list of pairs keeps them all.
    print(list(get_entries("dicrete.txt.rtf")))
I can only get this code to return a single entry (the first "Finished" it reads as key and "YES" or "NO" as value, which is what I want), but I want it to return every line in the file that starts with "Finished". So for the sample data I provided, I want it to return a dict with 2 entries: {Finished:"NO" , Finished:"NO"}
Dictionaries can only store one mapping per key. So, you can never have a dictionary that has two different entries for the same key.
Consider using a list of two-tuples instead, like [("Finished", "NO"), ("Finished", "NO")].
Sounds like you need a better data model! Let's look at that, shall we?
Let's define an Assignment class that we can call with all the lines of text between Assignment: # and Finished: YES/NO.
class Assignment(object):
    """One assignment record; ``finished`` is normalised to a bool."""

    def __init__(self, id, *args, **kwargs):
        """Store ``id`` and expose every keyword as a lower-cased attribute.

        Raises:
            AttributeError: if no ``finished`` keyword (in any letter case)
                was supplied, or its value is ``None``.
        """
        self.id = id
        for key, val in kwargs.items():
            setattr(self, key.lower(), val)
        finished = getattr(self, 'finished', None)
        if finished is None:
            raise AttributeError("All assignments must have a 'finished' value")
        self.finished = finished.lower() == "yes"

    @classmethod
    def from_string(cls, s):
        """Build an Assignment from newline-separated ``key: value`` text.

        The header line may be written ``Assignment 9`` or ``Assignment: 9``;
        its last token is used as the integer id. Every other non-blank
        line becomes an attribute (``None`` when the line has no value).

        Raises:
            ValueError: if no line starts with "Assignment", or the header
                does not end in an integer.
        """
        attributes = {}
        assignment_id = None
        for line in s.splitlines():
            key, _, val = line.partition(":")
            key = key.strip()
            if not key:
                continue
            if key.lower().startswith('assignment'):
                # Drop the optional colon so both header spellings end in
                # the number.
                assignment_id = int(line.replace(":", " ").split()[-1])
                continue
            attributes[key.lower()] = val.strip() or None
        if assignment_id is None:
            raise ValueError("No 'Assignment' field in string {}".format(s))
        return cls(assignment_id, **attributes)
Once you have your model, you'll need to parse your input. Luckily this is actually pretty simple.
def splitlineson(s, sentinel):
    """Group an iterable of lines into blocks that each start at a sentinel.

    A line opens a new block when its stripped, lower-cased text starts
    with ``sentinel`` (compared case-insensitively). Lines seen before the
    first sentinel are discarded and blank lines are skipped.

    Args:
        s: Iterable of strings (for example an open text file).
        sentinel: Prefix that marks the first line of a block.

    Yields:
        One newline-joined string per block, with every line stripped.
        Nothing is yielded when no line matches the sentinel.

    >>> s = ["Garbage", "lines", "SENT$", "first", "group", "SENT$", "second", "group"]
    >>> list(splitlineson(s, "SENT$"))
    ['SENT$\\nfirst\\ngroup', 'SENT$\\nsecond\\ngroup']
    """
    marker = sentinel.lower()
    group = []
    in_group = False  # becomes True once the first sentinel has been seen
    for raw in s:
        line = raw.strip()
        if line.lower().startswith(marker):
            if in_group:
                yield "\n".join(group)
            group = [line]
            in_group = True
        elif in_group and line:
            group.append(line)
    if in_group:
        yield "\n".join(group)
with open('path/to/textfile.txt') as inf:
    assignments = splitlineson(inf, "assignment ")
    # splitlineson is a lazy generator: it has to be consumed while the
    # file is still open, so the list is built inside the with-block.
    assignment_list = [Assignment.from_string(a) for a in assignments]
Related
I am trying to find a name that is in two separate lists I created and have a function check to see if it is there. I know it is checking the list and I have printed out the list to make sure it is stored correctly but it keeps giving me my error statement that the name is not found in the list. Here is the code I have for it.
# Question code, kept as posted: reads BoyNames.txt into a list, one entry per line.
def readBoyFiles():
boyfile = 'BoyNames.txt'
boyList = []
with open(boyfile, 'r') as lis:
for line in lis:
# NOTE: the line is appended unchanged, so each entry keeps its trailing "\n".
boyList.append(line)
return boyList
# Question code, kept as posted: reads GirlNames.txt into a list, one entry per line.
def readGirlFiles():
girlfile = 'GirlNames.txt'
girlList = []
with open(girlfile, 'r') as names:
for line in names:
# NOTE: the line is appended unchanged, so each entry keeps its trailing "\n".
girlList.append(line)
return girlList
# Prompts for a name, loads both name lists and reports on each list in turn.
def nameInput():
name = input('Please enter the name you would like to search: ')
list1 = readBoyFiles()
list2 = readGirlFiles()
# findName prints its verdict itself; it is called once per list.
findName(name, list1)
findName(name, list2)
# Prints whether `name` is an element of `list` (exact string match).
# NOTE: the parameter named `list` shadows the builtin inside this function.
def findName(name, list):
if name in list:
print('This name is among the most popular!')
else:
print('This name is not among the most popular.')
# Script entry point: runs the interactive lookup once.
nameInput()
When I throw in a print statement like print(list1), it gives me the names in this format, ['Jacob\n', ....] and when I test it it prints out my else statement regardless of what I type in for the input. I have also tried checking it with the index function and it tells me that 'Jacob' is not in list if I try that. I feel like I have to be overlooking something because I've written similar code that works properly and this is almost a mirror image of it except with different data types.
Remember to strip your strings! It removes leading and trailing whitespace. Technically, "Jacob" isn't in the list because "Jacob\n" is.
def readBoyFiles(boyfile='BoyNames.txt'):
    """Return the names stored in ``boyfile``, one per line.

    Each line is stripped of leading/trailing whitespace (including the
    newline) so that membership tests such as ``"Jacob" in names`` work.

    Args:
        boyfile: Path of the file to read; defaults to ``BoyNames.txt``.
    """
    with open(boyfile, 'r') as lis:
        return [line.strip() for line in lis]
def readGirlFiles(girlfile='GirlNames.txt'):
    """Return the names stored in ``girlfile``, one per line.

    Each line is stripped of leading/trailing whitespace (including the
    newline) so that membership tests such as ``"Emma" in names`` work.

    Args:
        girlfile: Path of the file to read; defaults to ``GirlNames.txt``.
    """
    with open(girlfile, 'r') as names:
        return [line.strip() for line in names]
A more pythonic version of your code
def load_list(file_name):
    """Return the whitespace-stripped lines of ``file_name`` as a list."""
    with open(file_name, 'r') as f:
        # Iterating the file streams it line by line; readlines() would
        # build a throwaway list first.
        return [name.strip() for name in f]
def get_lists_and_user_input():
    """Prompt for a name and load both popularity lists.

    Returns:
        A ``(boys_list, girls_list, name)`` tuple; ``name`` is the user's
        input with surrounding whitespace removed.
    """
    # raw_input only exists on Python 2; on Python 3 the same behaviour is
    # provided by input().
    try:
        read_line = raw_input
    except NameError:
        read_line = input
    name = read_line('Please enter the name you would like to search: ').strip()
    boys_list = load_list('popular_boys.txt')
    girls_list = load_list('popular_girls.txt')
    return boys_list, girls_list, name
def check_name(name, lst, _type):
    """Print whether ``name`` occurs in ``lst``.

    Args:
        name: The name to look up (exact match).
        lst: Collection of popular names.
        _type: Label inserted into the message, e.g. ``'boys'``.
    """
    verdict = 'a' if name in lst else 'NOT a'
    print('The name {} is {} popular {} name'.format(name, verdict, _type))
# Driver: load both lists and the user's input once, then report per list.
boys, girls, _name = get_lists_and_user_input()
check_name(_name, boys, 'boys')
check_name(_name, girls, 'girls')
I am trying to make a simple programme that can help make army lists for a popular tabletop wargame. It is more of an exercise for my own experience, as there are plenty of pre-made software packages that do this, but the idea behind it seems fairly straightforward.
The programme reads the data for all the units available in an army from a spreadsheet and creates various classes for each unit. The main bit I am looking at now is the options/ upgrades.
In the file I want a straightforward syntax for the option field for each unit. i.e. the following options string itemA, itemB/itemC-3, 2*itemD, itemE/itemF/itemG, itemH/itemI+itemJ would mean
1. you may take itemA (X pts per model)
2. for every 3 models, you may exchange itemB with
a) itemC (net X pts per model)
3. each model may take 2 of itemD (X pts per model)
4. each model may take one of either
a)itemE (X pts per model)
b)itemF (X pts per model)
c)itemG (X pts per model)
5. each model may take either
a)itemH (X points per model)
b)itemI and itemJ (X points per model)
At the moment I am processing the string using lots of splits and if statements, that make it very hard to keep track of and assign correctly once the user input their choice.
for index, option in enumerate(self.options):
output = "{}.".format(index+1)
if '-' in option:
sub_option, no_models = option.split('-')
no_models = int(no_models)
print(sub_option)
print(no_models)
output += "For every {} models ".format(no_models)
if '/' in sub_option:
temp_str, temp_options, points_list = exchange_option(sub_option)
else:
temp_str, temp_options, points_list = standard_option(sub_option)
index_points.append(points_list)
temp_options.append(no_models)
index_options.append(temp_options)
else:
if '/' in option:
temp_str, temp_options, points_list = exchange_option(option)
else:
temp_str, temp_options, points_list = standard_option(option)
index_points.append(points_list)
index_options.append(temp_options)
output += temp_str
the *_option() functions are additional helper functions I have defined above which have a similar structure with further if statements within them.
The main question I am asking, is there an easier way to process a code like string such as this? While it works to produce the output in the example above it seems awfully cumbersome to then deal with the user input.
What I am aiming to do is first output the string as given in my example at the top of the question, and then taking the user input index of the given option, modify the associated unit class to have the correct wargear and points value.
I thought about trying to make some kind of options class, but again labelling and defining each option so that they can interact with one another properly seems equally complex, and I feel there must be something more pythonic or just generally better coding practice to processing encoded strings such as this?
So, here's a full blown parser to do that! Now, this only outputs the list as in the previous version of your question, but it shouldn't be too hard to add more features as you want. Also please note that at the moment, the lexer does not error out when a string contains invalid tokens, but that's just a proof-of-concept, so it should be fine.
Part I: the lexer
This tokenises the input string - looks through it from left to right and attempts to classify non-overlapping substrings as instances of tokens. It's to be used before parsing. When given a string, Lexer.tokenize yields a stream of Tokens.
# FILE: lex.py
import re
import enum
class Token:
    """A lexical token: its type, matched text and where it was found."""

    def __init__(self, type, value: str, lineno: int, pos: int):
        self.type, self.value, self.lineno, self.pos = type, value, lineno, pos

    def __str__(self):
        v = f'({self.value!r})' if self.value else ''
        return f'{self.type.name}{v} at {self.lineno}:{self.pos}'

    __repr__ = __str__


class Lexer:
    """Regex-driven tokenizer.

    Args:
        token_types: Enum whose member names are the token names. It must
            contain ``newline`` and ``EOF`` members.
        tokens_regexes: Mapping of enum member -> regex source. The
            patterns are tried in the mapping's iteration order.
    """

    def __init__(self, token_types: enum.Enum, tokens_regexes: dict):
        self.token_types = token_types
        # One named group per token type, so Match.lastgroup tells which
        # alternative matched.
        regex = '|'.join(
            f'(?P<{tok.name}>{pattern})' for tok, pattern in tokens_regexes.items()
        )
        self.regex = re.compile(regex)

    def tokenize(self, string, skip=('space',)):
        """Yield the Tokens found in ``string``, ending with an EOF token.

        Args:
            string: Text to tokenize.
            skip: Names of token types that are recognised but not yielded.

        Each token's ``pos`` is the offset, within its line, just past the
        end of the matched text; ``lineno`` is 0-based. Newline tokens are
        never yielded. Text that matches no pattern is silently ignored.
        """
        # TODO: detect invalid input
        lineno, line_start, pos = 0, 0, 0
        skipped = {self.token_types[name] for name in skip}
        for matchobj in self.regex.finditer(string):
            type_name = matchobj.lastgroup
            value = matchobj.group(type_name)
            Type = self.token_types[type_name]
            if Type == self.token_types.newline:
                # Line bookkeeping lives in locals; the instance has no
                # lineno/pos attributes.
                lineno += 1
                line_start = matchobj.end()
                pos = 0
                continue
            pos = matchobj.end() - line_start
            if Type not in skipped:
                yield Token(Type, value, lineno, pos)
        yield Token(self.token_types.EOF, '', lineno, pos)
Part II: the parser (with syntax-driven evaluation):
This parses the given stream of tokens provided by lex.Lexer.tokenize and translates individual symbols to English according to the following grammar:
Opt_list -> Option Opt_list_
Opt_list_ -> comma Option Opt_list_ | empty
Option -> Choice | Mult
Choice -> Compound More_choices Exchange
Compound -> item Add_item
Add_item -> plus item Add_item | empty
More_choices -> slash Compound More_choices | empty
Exchange -> minus num | empty
Mult -> num star Compound
The uppercase symbols are nonterminals, the lowercase ones are terminals. There's also a special symbol EOF that's not present here.
Also, take a look at the vital statistics of this grammar. This grammar is LL(1), so we can use an LL(1) recursive descent predictive parser, as shown below.
If you modify the grammar, you should modify the parser accordingly! The methods that do the actual parsing are called parse_<something>, and to change the output of the parser (the Parser.parse function, actually) you should change the return values of these parse_<something> functions.
# FILE: parse.py
import lex
class Parser:
    """LL(1) recursive-descent parser that renders an option string in English.

    Each ``parse_<symbol>`` method handles the grammar nonterminal of the
    same name and returns the English fragments produced for it.

    Args:
        lexer: Object exposing ``token_types`` (an enum with at least the
            members item, num, plus, minus, star, slash, comma and EOF)
            and ``tokenize(string)`` returning an iterator of tokens that
            have ``type``, ``value``, ``lineno`` and ``pos`` attributes.
    """

    def __init__(self, lexer):
        self.string, self.tokens = None, None
        self.lexer = lexer
        self.t = self.lexer.token_types
        self.__lookahead = None

    @property
    def lookahead(self):
        """The current token, fetched lazily from the token stream."""
        if not self.__lookahead:
            try:
                self.__lookahead = next(self.tokens)
            except StopIteration:
                # Stream ended without an explicit EOF token: make one.
                self.__lookahead = lex.Token(self.t.EOF, '', 0, -1)
        return self.__lookahead

    def next(self):
        """Advance to the next token and return it; EOF is sticky."""
        if self.__lookahead and self.__lookahead.type == self.t.EOF:
            return self.__lookahead
        self.__lookahead = None
        return self.lookahead

    def _syntax_error(self, expected):
        """Build a SyntaxError located at the current lookahead token."""
        return SyntaxError(
            f'Expected {expected}, got {self.lookahead.type}',
            ('<string>', self.lookahead.lineno, self.lookahead.pos, self.string),
        )

    def match(self, token_type):
        """Consume the lookahead if it has ``token_type``, else raise SyntaxError."""
        if self.lookahead.type == token_type:
            return self.next()
        raise self._syntax_error(token_type)

    # THE PARSING STARTS HERE
    def parse(self, string):
        """Parse ``string`` and return its English rendering.

        Fragments are joined with single spaces after a leading empty
        fragment, so every output line starts with one space.

        Raises:
            SyntaxError: if the token stream does not follow the grammar.
        """
        # setup
        self.string = string
        self.tokens = self.lexer.tokenize(string)
        self.__lookahead = None
        self.next()
        # do parsing
        ret = [''] + self.parse_opt_list()
        return ' '.join(ret)

    def parse_opt_list(self) -> list:
        """Opt_list -> Option Opt_list_"""
        ret = self.parse_option(1)
        ret.extend(self.parse_opt_list_(1))
        return ret

    def parse_opt_list_(self, curr_opt_number) -> list:
        """Opt_list_ -> comma Option Opt_list_ | empty"""
        if self.lookahead.type in {self.t.EOF}:
            return []
        self.match(self.t.comma)
        ret = self.parse_option(curr_opt_number + 1)
        ret.extend(self.parse_opt_list_(curr_opt_number + 1))
        return ret

    def parse_option(self, opt_number) -> list:
        """Option -> Choice | Mult, prefixed with its number, newline-terminated."""
        ret = [f'{opt_number}.']
        if self.lookahead.type == self.t.item:
            ret.extend(self.parse_choice())
        elif self.lookahead.type == self.t.num:
            ret.extend(self.parse_mult())
        else:
            # An option can only start with an item or a number.
            raise self._syntax_error(f'{self.t.item} or {self.t.num}')
        ret[-1] += '\n'
        return ret

    def parse_choice(self) -> list:
        """Choice -> Compound More_choices Exchange"""
        c = self.parse_compound()
        m = self.parse_more_choices()
        e = self.parse_exchange()
        if not m:
            if not e:
                ret = f'You may take {" ".join(c)}'
            else:
                ret = f'for every {e} models you may take item {" ".join(c)}'
        else:
            c.extend(m)
            if not e:
                ret = f'each model may take one of: {", ".join(c)}'
            else:
                ret = f'for every {e} models you may exchange the following items with each other: {", ".join(c)}'
        return [ret]

    def parse_compound(self) -> list:
        """Compound -> item Add_item; returns a one-element list."""
        ret = [self.lookahead.value]
        self.match(self.t.item)
        _ret = self.parse_add_item()
        return [' '.join(ret + _ret)]

    def parse_add_item(self) -> list:
        """Add_item -> plus item Add_item | empty"""
        if self.lookahead.type in {self.t.comma, self.t.minus, self.t.slash, self.t.EOF}:
            return []
        ret = ['with']
        self.match(self.t.plus)
        ret.append(self.lookahead.value)
        self.match(self.t.item)
        return ret + self.parse_add_item()

    def parse_more_choices(self) -> list:
        """More_choices -> slash Compound More_choices | empty"""
        if self.lookahead.type in {self.t.comma, self.t.minus, self.t.EOF}:
            return []
        self.match(self.t.slash)
        ret = self.parse_compound()
        return ret + self.parse_more_choices()

    def parse_exchange(self) -> str:
        """Exchange -> minus num | empty; returns the number text or ''."""
        if self.lookahead.type in {self.t.comma, self.t.EOF}:
            return ''
        self.match(self.t.minus)
        ret = self.lookahead.value
        self.match(self.t.num)
        return ret

    def parse_mult(self) -> list:
        """Mult -> num star Compound"""
        ret = [f'each model may take {self.lookahead.value} of:']
        self.match(self.t.num)
        self.match(self.t.star)
        return ret + self.parse_compound()
Part III: usage
Here's how to use all of that code:
# FILE: evaluate.py
import enum
from lex import Lexer
from parse import Parser
# these are all the types of tokens present in our grammar
token_types = enum.Enum('Types', 'item num plus minus star slash comma space newline empty EOF')
t = token_types
# these are the regexes that the lexer uses to recognise the tokens
terminals_regexes = {
    t.item: r'[a-zA-Z_]\w*',
    t.num: '0|[1-9][0-9]*',
    t.plus: r'\+',
    t.minus: '-',
    t.star: r'\*',
    t.slash: '/',
    t.comma: ',',
    t.space: r'[ \t]',
    t.newline: r'\n'
}
lexer = Lexer(token_types, terminals_regexes)
parser = Parser(lexer)
string = 'itemA, itemB/itemC-3, 2*itemD, itemE/itemF/itemG, itemH/itemI+itemJ'
print(f'STRING FROM THE QUESTION: {string!r}\nRESULT:')
print(parser.parse(string), '\n\n')
# Interactive loop: keep parsing commands until an empty line or a quit word.
string = input('Enter a command: ')
while string and string.lower() not in {'q', 'quit', 'e', 'exit'}:
    try:
        print(parser.parse(string))
    except SyntaxError as e:
        # The echoed text is indented by four spaces so that the caret,
        # placed at column 4 + offset - 1, lines up under it.
        print(f'    Syntax error: {e}\n    {e.text}\n' + ' ' * (4 + e.offset - 1) + '^\n')
    string = input('Enter a command: ')
Example session:
# python3 evaluate.py
STRING FROM THE QUESTION: 'itemA, itemB/itemC-3, 2*itemD, itemE/itemF/itemG, itemH/itemI+itemJ'
RESULT:
1. You may take itemA
2. for every 3 models you may exchange the following items with each other: itemB, itemC
3. each model may take 2 of: itemD
4. each model may take one of: itemE, itemF, itemG
5. each model may take one of: itemH, itemI with itemJ
Enter a command: itemA/b/c/stuff
1. each model may take one of: itemA, b, c, stuff
Enter a command: 4 * anything
1. each model may take 4 of: anything
Enter a command: 5 * anything + more
1. each model may take 5 of: anything with more
Enter a command: a + b + c+ d
1. You may take a with b with c with d
Enter a command: a+b/c
1. each model may take one of: a with b, c
Enter a command: itemA/itemB-2
1. for every 2 models you may exchange the following items with each other: itemA, itemB
Enter a command: itemA+itemB/itemC - 5
1. for every 5 models you may exchange the following items with each other: itemA with itemB, itemC
Enter a command: q
I have the input file :
sun vehicle
one number
two number
reduce command
one speed
five speed
zero speed
speed command
kmh command
I used the following code:
from collections import OrderedDict
# Question code, kept as posted (Python 2: has_key and print statements).
# Groups the first word of each "word tag" line under its tag, in first-seen order.
output = OrderedDict()
with open('final') as in_file:
for line in in_file:
columns = line.split(' ')
if len(columns) >= 2:
word,tag = line.strip().split()
# First time this tag is seen: start an empty word list for it.
if output.has_key(tag) == False:
output[tag] = [];
output[tag].append(word)
else:
print ""
# Every tag is printed as one flat element; nothing here nests one tag inside another.
for k, v in output.items():
print '<{}> {} </{}>'.format(k, ' '.join(v), k)
output = OrderedDict()
I am getting the output as:
<vehicle> sun </vehicle>
<number> one two </number>
<command> reduce speed kmh </command>
<speed> one five zero </speed>
But my expected output should be:
<vehicle> sun </vehicle>
<number> one two </number>
<command> reduce
<speed> one five zero </speed>
speed kmh </command>
Can someone help me in solving this?
It looks like the output you want to achieve is underspecified!
You presumably want the code to "know in advance" that speed is a part of command, before you get to the line speed command.
To do what you want, you will need a recursive function.
How about
# Print every tag with nested tags expanded. The single-argument call form
# of print works on both Python 2 and Python 3.
for k, v in output.items():
    print(expandElements(k, v, output))
and somewhere you define
def expandElements(k, v, dic, _open_tags=()):
    """Render tag ``k`` with its words ``v``, expanding nested tags recursively.

    Args:
        k: Tag name.
        v: Iterable of the words collected for ``k``.
        dic: Mapping of every tag name to its list of words.
        _open_tags: Internal. Tags currently being expanded, so that a tag
            which directly or indirectly contains itself is emitted as a
            plain word and cannot recurse forever.

    Returns:
        A string of the form ``<k> ... </k>`` in which every word that is
        itself a tag in ``dic`` is replaced by that tag's own expansion.
    """
    open_tags = _open_tags + (k,)
    parts = ['<' + k + '>']
    for i in v:
        if i in dic and i not in open_tags:
            parts.append(expandElements(i, dic[i], dic, open_tags))
        else:
            parts.append(i)
    parts.append('</' + k + '>')
    return ' '.join(parts)
It looks like you want some kind of tree structure for your output?
You are printing out with print '<{}> {} </{}>'.format(k, ' '.join(v), k) so all of your output is going to have the form of '<{}> {} </{}>'.
If you want to nest things you are going to need a nested structure to represent them.
For recursively parsing the input file I would make a class representing the tag. Each tag can have children. Every child is first a string, added manually with tag.children.append("value") or by calling tag.add_tag(tag.name, "value").
# A node of the tag tree: a name, an ordered list of children (plain strings
# or Tag objects), a link to its parent and a has_root flag.
class Tag:
def __init__(self, name, parent=None):
self.name = name
self.children = []
# Set to False by add_tag() on tags it creates; add_tag() only moves tags whose flag is False.
self.has_root = True
self.parent = parent
def __str__(self):
""" compose string for this tag (recursivly) """
# A tag without children renders as its bare name.
if not self.children:
return self.name
children_str = ' '.join([str(child) for child in self.children])
# A tag without a parent renders only its children, without wrapping markup.
if not self.parent:
return children_str
return '<%s>%s</%s>' % (self.name, children_str, self.name)
# NOTE(review): presumably this was the decorator @classmethod before extraction; from_file takes cls and is called on the class below.
#classmethod
def from_file(cls, file):
""" create root tag from file """
obj = cls('root')
# NOTE(review): columns is assigned but never read in this method.
columns = []
with open(file) as in_file:
for line in in_file:
# Splits on a single space and unpacks exactly two fields per line.
value, tag = line.strip().split(' ')
obj.add_tag(tag, value)
return obj
def search_tag(self, tag):
""" search for a tag in the children """
if self.name == tag:
return self
for i, c in enumerate(self.children):
if isinstance(c, Tag) and c.name == tag:
return c
elif isinstance(c, str):
# Side effect: a matching string child is replaced in place by a new Tag object.
if c.strip() == tag.strip():
self.children[i] = Tag(tag, self)
return self.children[i]
else:
result = c.search_tag(tag)
if result:
return result
def add_tag(self, tag, value):
"""
add a value, tag pair to the children
Firstly this searches if the value is an child. If this is the
case it moves the children to the new location
Afterwards it searches the tag in the children. When found
the value is added to this tag. If not a new tag object
is created and added to this Tag. The flag has_root
is set to False so the element can be moved later.
"""
value_tag = self.search_tag(value)
if value_tag and not value_tag.has_root:
print("Found value: %s" % value)
if value_tag.parent:
# Detach the existing Tag from its parent; value now refers to that Tag object.
i = value_tag.parent.children.index(value_tag)
value = value_tag.parent.children.pop(i)
value.has_root = True
else:
print("not %s" % value)
found = self.search_tag(tag)
if found:
found.children.append(value)
else:
# no root
tag_obj = Tag(tag, self)
self.children.append(tag_obj)
tag_obj.add_tag(tag, value)
tag_obj.has_root = False
# Driver: build the tree from the file named 'final' and print it.
tags = Tag.from_file('final')
print(tags)
I know in this example the speed-Tag is not added twice. I hope that's ok.
Sorry for the long code.
This is a sample of the raw text i'm reading:
ID: 00000001
SENT: to do something
to 01573831
do 02017283
something 03517283
ID: 00000002
SENT: just an example
just 06482823
an 01298744
example 01724894
Right now I'm trying to split it into a lists of lists of lists.
Topmost level list: By the ID so 2 elements here (done)
Next level: Within each ID, split by newlines
Last level: Within each line split the word and ID, for the lines beginning with ID or SENT, it doesn't matter if they are split or not. Between the word and their ID is an indent (\t)
Current code:
# Question code, kept as posted. NOTE: the file object is never closed in this snippet.
f=open("text.txt","r")
raw=list(f)
# Joining with " " puts a space after every newline, which is why the separator below is "\n \n ".
text=" ".join(raw)
wordlist=text.split("\n \n ") #split by ID
toplist=wordlist[:2] #just take 2 IDs
Edit:
I was going to cross-reference the words to another text file to add their word classes which is why i asked for a lists of lists of lists.
Steps:
1) Use .append() to add on word classes for each word
2) Use "\t".join() to connect a line together
3) Use "\n".join() to connect different lines in an ID
4) "\n\n".join() to connect all the IDs together into a string
Output:
ID: 00000001
SENT: to do something
to 01573831 prep
do 02017283 verb
something 03517283 noun
ID: 00000002
SENT: just an example
just 06482823 adverb
an 01298744 ind-art
example 01724894 noun
A more pythonic version of Thorsten's answer:
from collections import namedtuple
class Element(namedtuple("ElementBase", "id sent words")):
    """One ID block: its id, its sentence and a word -> word-id mapping."""

    @classmethod
    def parse(cls, source):
        """Build an Element from one newline-separated block of text.

        The first line is expected to read ``ID: <id>`` and the second
        ``SENT: <sentence>``; the fixed-width prefixes are cut off. Every
        following non-blank line must be ``<word>\\t<word id>``.

        Raises:
            IndexError: if ``source`` has fewer than two lines.
            ValueError: if a word line contains no tab.
        """
        lines = source.split("\n")
        return cls(
            id=lines[0][4:],
            sent=lines[1][6:],
            words=dict(
                # Split on the first tab only and ignore blank lines so a
                # trailing newline does not break the parse.
                line.split("\t", 1) for line in lines[2:] if line.strip()
            )
        )
# Sample input. The blank line between the two records is what the
# "\n\n" split below relies on.
text = """ID: 00000001
SENT: to do something
to\t01573831
do\t02017283
something\t03517283

ID: 00000002
SENT: just an example
just\t06482823
an\t01298744
example\t01724894"""
elements = [Element.parse(part) for part in text.split("\n\n")]
for el in elements:
    # Single-argument print calls behave the same on Python 2 and 3.
    print(el)
    print(el.id)
    print(el.sent)
    print(el.words)
    print("")
I'd regard every part of the topmost split as an "object". Thus, I'd create a class with properties corresponding to each part.
class Element(object):
    """One ID block parsed from text; exposes ``ID``, ``sent`` and ``words``."""

    def __init__(self, source):
        """Parse one newline-separated block.

        The first line is expected to read ``ID: <id>`` and the second
        ``SENT: <sentence>``; every following non-blank line must be
        ``<word>\\t<word id>``.

        Raises:
            IndexError: if ``source`` has fewer than two lines.
            ValueError: if a word line contains no tab.
        """
        lines = source.split("\n")
        self._id = lines[0][4:]
        self._sent = lines[1][6:]
        self._words = {}
        for line in lines[2:]:
            if not line.strip():
                continue  # tolerate blank or trailing empty lines
            word, id_ = line.split("\t", 1)
            self._words[word] = id_

    @property
    def ID(self):
        """The id text that follows ``ID: `` on the first line."""
        return self._id

    @property
    def sent(self):
        """The sentence that follows ``SENT: `` on the second line."""
        return self._sent

    @property
    def words(self):
        """Dict mapping each word to its id string."""
        return self._words

    def __str__(self):
        return "Element %s, containing %i words" % (self._id, len(self._words))
# Sample input. The blank line between the two records is what the
# "\n\n" split below relies on.
text = """ID: 00000001
SENT: to do something
to\t01573831
do\t02017283
something\t03517283

ID: 00000002
SENT: just an example
just\t06482823
an\t01298744
example\t01724894"""
elements = [Element(part) for part in text.split("\n\n")]
for el in elements:
    # Single-argument print calls behave the same on Python 2 and 3.
    print(el)
    print(el.ID)
    print(el.sent)
    print(el.words)
    print("")
In the main code (one line, the list comprehension) the text is only split at each double new-line. Then, all logic is deferred into the __init__ method, making it very local.
Using a class also gives you the benefit of __str__, allowing you control over how your objects are printed.
You could also consider rewriting the last three lines of __init__ to:
self._words = dict([line.split("\t") for line in lines[2:]])
but I wrote a plain loop as it seemed to be easier to understand.
Using a class also gives you the
I'm not sure exactly what output you need but you can adjust this to fit your needs (This uses the itertools grouper recipe):
>>> from itertools import izip_longest
>>> def grouper(n, iterable, fillvalue=None):
"Collect data into fixed-length chunks or blocks"
# grouper(3, 'ABCDEFG', 'x') --> ABC DEF Gxx
args = [iter(iterable)] * n
return izip_longest(fillvalue=fillvalue, *args)
>>> with open('text.txt') as f:
print [[x.rstrip().split(None, 1) for x in g if x.rstrip()]
for g in grouper(6, f, fillvalue='')]
[[['ID:', '00000001'], ['SENT:', 'to do something'], ['to', '01573831'], ['do', '02017283'], ['something', '03517283']],
[['ID:', '00000002'], ['SENT:', 'just an example'], ['just', '06482823'], ['an', '01298744'], ['example', '01724894']]]
would this work for you?:
Top - level (which you have done)
def get_parent(text, parent):
    """Split ``text`` into chunks that each start at an 'ID' marker.

    Every chunk runs from one occurrence of the substring ``'ID'`` up to
    (not including) the next one and is appended to ``parent`` in order.
    Text before the first marker is dropped. Nothing is appended when
    ``text`` contains no marker.

    Args:
        text: The raw text to split.
        parent: List that receives the chunks (mutated in place).

    Returns:
        None.
    """
    # Iterative scan: one pass over the text, no recursion depth limit and
    # no repeated re-slicing of the remaining text.
    open_ID = text.find('ID')
    while open_ID != -1:
        close_ID = text.find('ID', open_ID + 1)
        if close_ID == -1:
            parent.append(text[open_ID:])
            return
        parent.append(text[open_ID:close_ID])
        open_ID = close_ID
Second - level: split by newlines:
def child_split(parent):
    """Replace every string in ``parent`` by its list of lines, in place.

    Args:
        parent: List of strings; after the call each element is the
            result of splitting that string on newline characters.

    Returns:
        None.
    """
    # Slice assignment keeps the caller's list object while replacing
    # every element.
    parent[:] = [chunk.split('\n') for chunk in parent]
Third - level: split the 'ID' and 'SENT' fields
def split_field(parent, index):
    """Split every line of ``parent[index:]`` into fields, in place.

    A line containing ':' is split on every colon; any other line is
    split on whitespace.

    Args:
        parent: List of lists of strings (one inner list per ID block).
        index: Position of the first block to process; blocks before it
            are left untouched.

    Returns:
        None.
    """
    # Plain loop, not one recursive call per block, so long inputs
    # cannot exhaust the recursion limit.
    for block in parent[index:] if index >= 0 else parent[index:]:
        for child, line in enumerate(block):
            if ':' in line:
                block[child] = line.split(':')
            else:
                block[child] = line.split()
Running it all together:
def main(text):
    """Run the three splitting passes over ``text`` and return the result.

    Returns:
        The nested list built by get_parent, child_split and split_field
        (blocks -> lines -> fields).
    """
    parent = []
    get_parent(text, parent)
    child_split(parent)
    split_field(parent, 0)
    # Hand the structure back to the caller; otherwise it is thrown away.
    return parent
The result is quite nested, perhaps it can be cleaned up somewhat? Or perhaps the split_fields() function could return a dictionary?
Apologies for the really long, drawn-out question.
I am trying to read in a config file and get a list of rules out.
I have tried to use ConfigParser to do this but it is not a standard config file.
The file contains no section header and no token.
i.e.
config section a
set something to something else
config subsection a
set this to that
next
end
config firewall policy
edit 76
set srcintf "There"
set dstintf "Here"
set srcaddr "all"
set dstaddr "all"
set action accept
set schedule "always"
set service "TCP_5600"
next
edit 77
set srcintf "here"
set dstintf "there"
set srcaddr "all"
set dstaddr "all"
set action accept
set schedule "always"
set service "PING"
next
end
As I couldn't work out how to get ConfigParser to work I thought I would try to iterate through the file, unfortunately I don't have much programming skill so I have got stuck.
I really think I am making this more complicated than it should be.
Here's the code I have written;
# Question code, kept as posted (Python 2 print at the end).
# Line-oriented parser: each config line's first word selects a parse_<word> method.
class Parser(object):
def __init__(self):
self.config_section = ""
self.config_header = ""
self.section_list = []
self.header_list = []
def parse_config(self, fields): # Create a new section
new_list = []
self.config_section = " ".join(fields)
new_list.append(self.config_section)
# Every section after the first is appended inside the most recently added top-level entry.
if self.section_list: # Create a sub section
self.section_list[-1].append(new_list)
else: self.section_list.append(new_list)
def parse_edit(self, line): # Create a new header
self.config_header = line[0]
# NOTE: header_list is one list shared by every edit; nothing in this class resets it.
self.header_list.append(self.config_header)
self.section_list[-1].append(self.header_list)
def parse_set(self, line): # Key and values
key_value = {}
key = line[0]
values = line[1:]
key_value[key] = values
if self.header_list:
self.header_list.append(key_value)
else: self.section_list[-1].append(key_value)
# parse_next and parse_end only reassign the config_header / config_section attributes;
# section_list and header_list are left untouched.
def parse_next(self, line): # Close the header
self.config_header = []
def parse_end(self, line): # Close the section
self.config_section = []
def parse_file(self, path):
with open(path) as f:
for line in f:
# Clean up the fields and remove unused lines.
fields = line.replace('"', '').strip().split(" ")
# The chain below only filters: recognised keywords fall through, anything else is skipped.
if fields[0] == "set":
pass
elif fields[0] == "end":
pass
elif fields[0] == "edit":
pass
elif fields[0] == "config":
pass
elif fields[0] == "next":
pass
else: continue
# fetch and call method.
method = fields[0]
parse_method = "parse_" + method
getattr(Parser, parse_method)(self, fields[1:])
return self.section_list
config = Parser().parse_file('test_config.txt')
print config
The output I am looking for is something like the following;
[['section a', {'something': 'to something else'}, ['subsection a', {'this': 'to that'}]],['firewall policy',['76',{'srcintf':'There'}, {'dstintf':'Here'}{etc.}{etc.}]]]
and this is what I get
[['section a']]
EDIT
I have changed the above to reflect where I am currently at.
I am still having issues getting the output I expect. I just can't seem to get the list right.
class Parser(object):
    """Skeleton of a keyword-dispatching config parser.

    ``parse_file`` reads a file line by line and hands the words after
    the first one to the ``parse_<keyword>`` method named by the first
    word.
    """

    # Keywords that may be dispatched; anything else is ignored.
    KEYWORDS = ("config", "edit", "set", "next", "end")

    def __init__(self):
        self.my_section = 0
        self.flag_section = False
        # ...

    def parse_config(self, fields):
        """Count a newly opened section and mark a section as open."""
        self.my_section += 1
        # go on with fields
        # ...
        self.flag_section = True

    def parse_edit(self, line):
        ...

    def parse_set(self, line):
        ...

    def parse_end(self, line):
        ...

    def parse_file(self, path):
        """Dispatch every recognised line of the file at ``path``.

        Blank lines, unknown keywords and keywords that have no
        ``parse_<keyword>`` method are skipped.
        """
        with open(path) as f:
            for line in f:
                # Split the current line, not the file object.
                fields = line.strip().split(" ")
                method = fields[0]
                if method not in self.KEYWORDS:
                    continue
                # fetch and call method
                handler = getattr(self, "parse_" + method, None)
                if handler is not None:
                    handler(fields[1:])
I post my answer for people who first come here from Google when trying to parse Fortigate configuration file !
I rewrote what I found here based on my own needs and it works great.
from collections import defaultdict
from pprint import pprint
import sys
def f():
    """Return a defaultdict whose missing keys are themselves such dicts."""
    return defaultdict(f)


def getFromDict(dataDict, mapList):
    """Return the value reached by following the keys of ``mapList`` in order.

    An empty ``mapList`` returns ``dataDict`` itself.
    """
    # Plain loop in place of reduce(), which is not a builtin on Python 3.
    node = dataDict
    for key in mapList:
        node = node[key]
    return node


def setInDict(dataDict, mapList, value):
    """Store ``value`` at the nested position given by the keys in ``mapList``.

    All keys but the last are looked up with getFromDict; the last key is
    assigned on the mapping found there.
    """
    getFromDict(dataDict, mapList[:-1])[mapList[-1]] = value
class Parser(object):
    """Parse a config/edit/set/next/end style file into nested dicts.

    ``config_header`` is the stack of currently open ``config`` and
    ``edit`` names; every ``set`` stores its value under that path.
    """

    def __init__(self):
        self.config_header = []
        self.section_dict = defaultdict(f)

    def parse_config(self, fields):  # Create a new section
        self.config_header.append(" ".join(fields))

    def parse_edit(self, line):  # Create a new header
        # Join all words: after the quotes are removed, a name written
        # with spaces arrives here as several fields.
        self.config_header.append(" ".join(line))

    def parse_set(self, line):  # Key and values
        if not line:
            return  # a bare "set" carries nothing to store
        key = line[0]
        values = " ".join(line[1:])
        headers = self.config_header + [key]
        setInDict(self.section_dict, headers, values)

    def parse_next(self, line):  # Close the header
        self.config_header.pop()

    def parse_end(self, line):  # Close the section
        self.config_header.pop()

    def parse_file(self, path):
        """Parse the file at ``path`` and return the nested section dict.

        Double quotes are removed from every line. Lines whose first word
        is not one of set/end/edit/config/next are ignored, as are blank
        lines.
        """
        valid_fields = ("set", "end", "edit", "config", "next")
        with open(path) as conf:
            gen_lines = (line.rstrip() for line in conf if line.strip())
            for line in gen_lines:
                # Clean up the fields and remove unused lines.
                fields = line.replace('"', '').strip().split(" ")
                if fields[0] in valid_fields:
                    # fetch and call method
                    getattr(self, "parse_" + fields[0])(fields[1:])
        return self.section_dict
config = Parser().parse_file('FGT02_20130308.conf')
# Single-argument print calls behave the same on Python 2 and 3.
print(config["system admin"]["admin"]["dashboard-tabs"]["1"]["name"])
print(config["firewall address"]["ftp.fr.debian.org"]["type"])
I do not know if this can help you too, but it did for me : http://wiki.python.org/moin/ConfigParserExamples
Have fun !
I would do it in a simpler way:
# Flags recording whether a section / sub-section is currently open, plus
# running counters of how many of each kind of line have been seen.
flagSection = False
flagSub = False
mySection = 0
mySubsection = 0
myItem = 0
with open('d:/config.txt', 'r') as f:
    gen_lines = (line.rstrip() for line in f if line.strip())
    for line in gen_lines:
        if line[0:7] == 'config ':
            mySection = mySection + 1
            newLine = line[7:]
            # Create a new section
            # Mark section as open
            flagSection = True
        elif line[0:5] == 'edit ':
            mySubsection = mySubsection + 1
            newLine = line[5:]
            # Create a new sub-section
            # Mark subsection as open
            flagSub = True
        elif line[0:4] == 'set ':
            myItem = myItem + 1
            # Everything after the name is the value; a "set" with no
            # value yields an empty string.
            name, _, value = line[4:].partition(' ')
            # Add to whatever is open
        elif line == 'end':
            # If subsection = open then close and goto end
            if flagSub:
                flagSub = False
            # Or if section = open then close and goto end
            elif flagSection:
                flagSection = False
            # :End
            continue
The instruction gen_lines = (line.rstrip() for line in f if line.strip())
creates a generator of not empty lines (thanks to the test if line.strip()) without newline and without blanks at the right (thanks to line.rstrip())
.
If I would know more about the operations you want to perform with name,value and in the section opened with if line=='end' , I could propose a code using regexes.
Edit
# Benchmark kept as posted: Python 2 syntax (print statements, xrange).
# NOTE: time.clock was removed in Python 3.8, so this import fails on current interpreters.
from time import clock
n = 1000000
print 'Measuring times with clock()'
# First measurement: slice comparison, timed by hand around a loop of n iterations.
te = clock()
for i in xrange(n):
x = ('abcdfafdf'[:3] == 'end')
print clock()-te,
print "\tx = ('abcdfafdf'[:3] == 'end')"
# Second measurement: the same loop using str.startswith.
te = clock()
for i in xrange(n):
x = 'abcdfafdf'.startswith('end')
print clock()-te,
print "\tx = 'abcdfafdf'.startswith('end')"
print '\nMeasuring times with timeit module'
import timeit
# timeit.repeat returns one total per repetition; the minimum of the 10 runs is reported.
ti = timeit.repeat("x = ('abcdfafdf'[:3] == 'end')",repeat=10,number = n)
print min(ti),
print "\tx = ('abcdfafdf'[:3] == 'end')"
to = timeit.repeat("x = 'abcdfafdf'.startswith('end')",repeat=10,number = n)
print min(to),
print "\tx = 'abcdfafdf'.startswith('end')"
result:
Measuring times with clock()
0.543445605517 x = ('abcdfafdf'[:3] == 'end')
1.08590449345 x = 'abcdfafdf'.startswith('end')
Measuring times with timeit module
0.294152748464 x = ('abcdfafdf'[:3] == 'end')
0.901923289133 x = 'abcdfafdf'.startswith('end')
Are the times smaller with timeit than with clock() because the GC is switched off while timeit runs the statement? Anyway, with either clock() or the timeit module, executing startswith() takes more time than slicing.