I'm writing a program to extract some data from txt files with regular expressions.
I'm new in OOP and want to save reiterative code. I want to retrieve about 15 data in each txt file, so I wrote a Class definition for each data. The patters to match can come in several formats, so I'll need to try several regex patters. By now, I only implements one regex patterns by data, but in future I need to try more in order to match the specific format used in that txt file, I plan to use a list with de patterns for each data.
I've just wrote 3 classes, but I've realized that I'm repeating too much code. So, I believe that I'm doing something wrong.
import re
import os
import glob
import csv
class PropertyNumber(object):
pattern_text = "(?<=FINCA Nº: )\w{3,6}"
regex_pattern = re.compile(pattern_text)
def __init__(self, str):
self.text_to_search = str
self.text_found = ""
def search_p_number(self):
matched_p_number = PropertyNumber.regex_pattern.search(self.text_to_search)
print(matched_p_number)
self.text_found = matched_p_number.group()
return self.text_found
class PropertyCoefficient(object):
pattern_text = "(?<=Participación: )[0-9,]{1,8}"
regex_pattern = re.compile(pattern_text)
def __init__(self, str):
self.text_to_search = str
self.text_found = ""
def search_p_coefficient(self):
matched_p_coefficient = PropertyCoefficient.regex_pattern.search(self.text_to_search)
print(matched_p_coefficient)
self.text_found = matched_p_coefficient.group()
return self.text_found
class PropertyTaxIDNumber(object):
pattern_text = "(?<=Referencia Catastral: )\d{7}[A-Z]{2}\d{4}[A-Z]\d{4}[A-Z]{2}"
regex_pattern = re.compile(pattern_text)
def __init__(self, str):
self.text_to_search = str
self.text_found = ""
def search_tax_id(self):
matched_p_taxidnumber = PropertyTaxIDNumber.regex_pattern.search(self.text_to_search)
print(matched_p_taxidnumber)
self.text_found = matched_p_taxidnumber.group()
return self.text_found
def scan_txt_report(fli):
data_retrieved = []
file_input = open(fli, mode='r', encoding='utf-8')
property_report = file_input.read()
property_number = PropertyNumber(property_report)
data_retrieved.append(property_number.search_p_number())
property_coefficient = PropertyCoefficient(property_report)
data_retrieved.append(property_coefficient.search_p_coefficient())
property_tax_id_number = PropertyTaxIDNumber(property_report)
data_retrieved.append(property_tax_id_number.search_tax_id())
return data_retrieved
def main():
if os.path.exists("./notas_simples/ns_txt"):
os.chdir("./notas_simples/ns_txt")
list_txt_files = glob.glob("*.txt")
print(list_txt_files)
with open("..\..\listado_de_fincas.csv", mode='w', newline='') as fiout:
file_writer = csv.writer(fiout, delimiter=';')
for file_name_input in list_txt_files:
data_line = scan_txt_report(file_name_input)
file_writer.writerow(data_line)
if __name__ == '__main__':
main()
# TODO Idufir: "(?<=IDUFIR: )\d{14}"
# TODO calle: "(?<=Calle ).*" Break down in street name and number of address
# TODO piso: "(?<=piso ).*," Break down in floor number and door number (or letter), without boundaries
# TODO titularidad: "(?<=TITULARIDAD\n\n).*" Break down in owner name, VAT number, % and domai type.
As you can see above, the 3 classes I've already wrote: PropertyNumber(object), PropertyCoefficient(object) and PropertyTaxIDNumber(object), has a lot of repeated code. Thus, when I add some regex patterns to each class will be worse.
Yes, you are repeating much of your code, and yes, it is a sign of a weak design. I'll take this as an OOP exercise, because this is an overkill.
First, we can see that the only difference between the different classes is their essence, and their regex pattern. So we can have a base class which handles all the repetitive code. Now each subclass simply handles the different pattern:
class BaseProperty(object):
def __init__(self, search_str, pattern):
self.text_to_search = search_str
self.text_found = ""
self.regex_pattern = re.compile(pattern)
def search_property(self):
matched_property = self.regex_pattern.search(self.text_to_search)
print(matched_property)
self.text_found = matched_property.group()
return self.text_found
class PropertyNumber(BaseProperty):
def __init__(self, search_str):
super(PropertyNumber, self).__init__(search_str, "(?<=FINCA Nº: )\w{3,6}")
class PropertyCoefficient(BaseProperty):
def __init__(self, search_str):
super(PropertyCoefficient, self).__init__(search_str, "(?<=Participación: )[0-9,]{1,8}")
Second, it doesn't appear that you're actually using the self.text_found field, so why store it? Now you can init all the properties in a single place, and make your scan_txt_report much simpler.
class BaseProperty(object):
def __init__(self, pattern):
self.regex_pattern = re.compile(pattern)
def search_property(self, search_str):
matched_property = self.regex_pattern.search(search_str)
print(matched_property)
return matched_property.group()
...
class PropertyCoefficient(BaseProperty):
def __init__(self):
super(PropertyCoefficient, self).__init__("(?<=Participación: )[0-9,]{1,8}")
properties = [PropertyNumber(), PropertyCoefficient(), ...]
def scan_txt_report(fli):
file_input = open(fli, mode='r', encoding='utf-8')
property_report = file_input.read()
data_retrieved = [prop.search_property(property_report) for prop in properties]
return data_retrieved
And unless you add some specific functionality for each subclass, you can even let go of the specific properties classes, and just do like this:
properties = [BaseProperty("(?<=FINCA Nº: )\w{3,6}"), BaseProperty("(?<=Participación: )[0-9,]{1,8}")]
And one last thing - please see the comment by #JonClements - it's a bad idea to use reserved words (such as str) as variable names.
There is no need for so many classes.It can be done via two classes.
Class Property(object,regex):
#def __init__ ...
#def prepare (This method will prepare return compiled form of regex
Class Search(object,compiled_regex):
#def __init__ ...
#def search ... (same function as now)
def scan_txt_report(fli):
data_retrieved = []
file_input = open(fli, mode='r', encoding='utf-8')
#take a csv containing all the regex.
#for all regex call property and search classes.keep appending results as well.
return data_retrieved
This way the only thing we need to change is the csv.The program remains intact and tested.
For adding new regex's the csv needs to be updated.
Related
Although I've seen similar questions about this on here none have really explained in a way I think applies to me. I'm working on an RPG game in python and I store my character's inventory in a text file. However when I try to return these inventory items as an Item() class object I'm having issues. Each item is stored as: 'level 10 armor of water' or something along these lines. They are stored as the item's name which contains all the information needed for the object. --> Item(item_type, item_level, item_element, name). Is there anyway to extract this data needed from the object's name in string form?
#inventory.txt:
['', '', '', 'level 10 armor of water', '', '', '', '', '', '']
#Item() constuctor
class Item(object):
def __init__(self, item_type, item_level, item_element, name):
self.item_type = item_type
self.item_level = item_level
self.item_element = item_element
self.name = name
#Inventory Constructor
class Inventory(object):
item_slot1 = ""
item_slot2 = ""
item_slot3 = ""
item_slot4 = ""
item_slot5 = ""
item_slot6 = ""
item_slot7 = ""
item_slot8 = ""
item_slot9 = ""
item_slot10 = ""
slots = [item_slot1, item_slot2, item_slot3, item_slot4, item_slot5, item_slot6, item_slot7, item_slot8, item_slot9, item_slot10]
I realize this isn't the most efficient way of doing things but all help is appreciated.
You can do this:
with open("inventory.txt", "r") as f:
arr = eval(f.read())
for item_string in arr:
item = Item.from_string(item_string)
Writing Item.from_string is going to be a bit cumbersome though, since the name doesn't appear to lend itself well to parsing (e.g. "level 10 armor of water" instead of "level 10|armor|of water" or something easier like that). I'd redesign your storage format, but if that isn't an option, you could use regular expressions, like so:
class Item:
#staticmethod
def from_string(item_string):
level_match = re.match("level (\d+)", item_string)
item_level = level_match.group(1)
type_match = re.match("(armor|sword|backpack|etc)", item_string)
item_type = type_match.group(1)
return Item(item_type, item_level)
Also, you will be executing whatever code is contained in inventory.txt. But since the game is in Python, somebody could just edit the source code for the game itself. Realistically it isn't a problem, imho, but keep it in mind.
Parsing text in this way, instead of using a structured format like json, will lead to problems.
But, in the meantime, you can load the attributes from the str as long as it has a structured/predictable format.
For instance, if we assume that the first two words are the string level followed by the level number then you can use that pattern so long as it's true 100% of the time.
class Item:
def __init__(self, item_type, item_level, item_element, name):
self.item_type = item_type
self.item_level = item_level
self.item_element = item_element
self.name = name
# A #classmethod is good for defining another type of constructor.
# In this example, the #classmethod is what builds the class out of
# the name str.
#classmethod
def load_from_name(cls, name_text):
name_text = name_text.strip() # remove all surrounding whitespace
if not name_text:
return None # the text is empty
words = name_text.split() # split the text into word tokens
if words[0] != "level":
raise ValueError("Must start with 'level'")
try:
level = int(words[1])
except ValueError:
raise ValueError("Second word must be valid int")
# Now we want all of the words before "of" to be the item_type
# and all of the words after "of" to be the element.
if "of" not in words:
raise ValueError("Missing 'of'")
item_type, element = " of ".split(" ".join(words[2:]))
# Finally we assemble the instance and return it
return cls(item_type, level, element, name_text)
Notice how many conditions we have to check for. There's definitely many checks and errors missing. Here's what a structured format looks like:
class Item:
def __init__(self, item_type, item_level, item_element, name):
self.item_type = item_type
self.item_level = item_level
self.item_element = item_element
self.name = name
#classmethod
def load_from_save_state(cls, state):
return cls(state["type"], state["level"], state["element"], state["name"])
Now, the data can be loaded from a json/yaml/whatever structured format super easily.
import json
item_config_json = """
{
"item_type": "water",
"level": 10,
"element": "armor",
"name": "level 10 armor of water"
}
"""
# In a real scenario, this would probably get a path name,
# and the json would contain a list of many dict objects.
def load_item_from_json(json_text):
state = json.loads(json_text)
return Item.load_from_save_state(state)
it does not work. I want to split data as in code in lines attribute.
class movie_analyzer:
def __init__(self,s):
for c in punctuation:
import re
moviefile = open(s, encoding = "latin-1")
movielist = []
movies = moviefile.readlines()
def lines(movies):
for movie in movies:
if len(movie.strip().split("::")) == 4:
a = movie.strip().split("::")
movielist.append(a)
return(movielist)
movie = movie_analyzer("movies-modified.dat")
movie.lines
It returns that:
You can use #property decorator to be able to access the result of the method as a property. See this very simple example of how this decorator might be used:
import random
class Randomizer:
def __init__(self, lower, upper):
self.lower = lower
self.upper = upper
#property
def rand_num(self):
return random.randint(self.lower, self.upper)
Then, you can access it like so:
>>> randomizer = Randomizer(0, 10)
>>> randomizer.rand_num
5
>>> randomizer.rand_num
7
>>> randomizer.rand_num
3
Obviously, this is a useless example; however, you can take this logic and apply it to your situation.
Also, one more thing: you are not passing self to lines. You pass movies, which is unneeded because you can just access it using self.movies. However, if you want to access those variables using self you have to set (in your __init__ method):
self.movielist = []
self.movies = moviefile.readlines()
To call a function you use movie.lines() along with the argument. What you are doing is just accessing the method declaration. Also, make sure you use self as argument in method definitions and save the parameters you want your Object to have. And it is usually a good practice to keep your imports at the head of the file.
import re
class movie_analyzer:
def __init__(self,s):
for c in punctuation:
moviefile = open(s, encoding = "latin-1")
self.movielist = []
self.movies = moviefile.readlines()
#property
def lines(self):
for movie in self.movies:
if len(movie.strip().split("::")) == 4:
a = movie.strip().split("::")
self.movielist.append(a)
return self.movielist
movie = movie_analyzer("movies-modified.dat")
movie.lines()
I am trying to create a program where I can store NPCs in a game with certain attributes. Such as: Faction, personality, interests/hobbies. To achieve this, I have created an NPC class.
class NPC: #name, faction, position/job, character, interests, other
def __init__ (self, name, faction, pos, char, inter, misc):
self.name = name
self.faction = faction
self.pos = pos
self.char = char
self.inter = inter
self.misc = misc
I have created various functions for this program, such as creating new ones, changing certain attributes on NPCs, removing them, printing them and sorting them. To store the NPCs, I append them to a list named "NPClist". I would like to know how to save this list to a .text file or something. So far I have tried the pickle module but that doesn't seem to work. (Sourced from: How to save a list to a file and read it as a list type?)
with open("NPCs.text", "wb") as file:
pickle.dump(NPClist, file)
with open("NPCs.text", "rb") as file:
NPClist.append(pickle.load(file))
I have put the bottom one at the top of the program so that it will load it when the program is launched and the top one at the top of a loop so that it will save frequently. When I try starting the program I get an error message.
AttributeError: Can't get attribute 'NPC' on <module '__main__' (built-in)>
Is there another way to solve this problem or am I just doing pickle the wrong way?
If all you need are the attributes I would suggest just saving the attributes instead of trying to save the entire object, and make this process easier with some helper methods in NPC.
For instance:
class NPC:
def dump(self):
return [self.name, self.faction, self.pos, self.char, self.inter, self.misc]
#staticmethod
def build_npc(attributes):
return NPC(*attributes)
And then you may deal with dumping like so:
NPClist = [NPC(...), NPC(...) ... ]
with open("NPCs.text", "wb") as file:
pickle.dump([i.dump() for i in NPClist], file)
And loading like so:
with open("NPCs.text", "rb") as file:
NPClist = [NPC.build_npc(attributes) for attributes in pickle.load(file)]
class NPC: #name, faction, position/job, character, interests, other
def __init__ (self, name, faction, pos, char, inter, misc):
self.name = name
self.faction = faction
self.pos = pos
self.char = char
self.inter = inter
self.misc = misc
NPCList = []
handsome_npc = NPC(name='n1c9', faction='Good People', pos='Developer',
char='', inter='', misc='')
# create other NPCs as needed
NPCList.append(handsome_npc)
with open('NPCS.text', 'w') as f:
f.write('name,faction,pos\n')
# add other attrs as wanted
for npc in NPCList:
f.write(f"{npc.name}, {npc.faction}, {npc.pos}")
# add other attrs as wanted
f.write('\n')
Tried to write something that's accessible to a beginner - might be a little verbose because of that. Mark Tyler's answer is really good, too!
re: comment - you could access the file afterwards like so:
class NPC: #name, faction, position/job, character, interests, other
def __init__ (self, name, faction, pos, char, inter, misc):
self.name = name
self.faction = faction
self.pos = pos
self.char = char
self.inter = inter
self.misc = misc
npclist_built_from_file = []
with open('NPCS.text', 'r') as f:
NPCS_lines = f.readlines()
for line in NPCS_lines[1:]: # skip the header line
npc = NPC(name=line[0], faction=line[1], pos=line[2], char='', inter='', misc='')
# I used empty strings for char/inter/misc because they were empty in the original
# example, but you would just fill out line[3], line[4], line[5] for the rest if wanted.
npclist_built_from_file.append(npc)
Then you could do whatever you wanted with the NPC objects in the list npclist_built_from_file
import ast
def stringifyNPC(c):
return str(c.__dict__)
def unStringifyNPC(s):
n = NPC(None,None,None,None,None,None)
n.__dict__ = ast.literal_eval(s)
return n
Trying to split up and tokenize a poem (or haiku in this case), which is more of a way to teach myself how to use nltk and classes than anything else. When I run the code below, I get a Name Error: name 'psplit' is not defined even though (my thinking is) that it's defined when I return it from the split function. Can anyone help me figure out what's going wrong under the hood here?
import nltk
poem = "In the cicada's cry\nNo sign can foretell\nHow soon it must die"
class Intro():
def __init__(self, poem):
self.__poem = poem
def split(self):
psplit = (poem.split('\n'))
psplit = str(psplit)
return psplit
def tokenizer(self):
t = nltk.tokenize(psplit)
return t
i = Intro(poem)
print(i.split())
print(i.tokenizer())
There are some issues in your code:
In the split method you have to use self.__poem to access the the poem attribute of your class - as you did in the constructor.
The psplit variable in the split method is only a local variable so you can just use it in this method and nowhere else. If you want to make the variable available in the tokenize method you have to either pass it as an argument or store it as an additional attribute:
...
def tokenizer(self, psplit):
t = nltk.tokenize(psplit)
return t
...
psplit = i.split()
print(i.tokenizer(psplit))
Or:
def __init__(self, poem):
...
self._psplit = None
...
def split(self):
self._psplit = (poem.split('\n'))
self._psplit = str(psplit)
def tokenizer(self):
t = nltk.tokenize(self._psplit)
return t
...
i.split()
print(i.tokenizer())
In addition make sure your indentation is correct.
I am trying to write a function which cleans up URLs (strips them of anything like "www.", "http://" etc.) to create a list that I can sort alphabetically.
I have tried to do this by creating a class including a method to detect the term I would like to remove from the URL-string, and remove it. The bit where I am struggling is that I want to add the modified URLs to a new list called new_strings, and then use that new list when I call the method for a second time on a different term, so that step by step I can remove all unwanted elements from the URL-string.
For some reason my current code returns an empty list, and I am also struggling to understand whether new_strings should be passed to __init__ or not? I guess I am a bit confused with global vs. local variables, and some help and explanation would be greatly appreciated. :)
Thanks! Code below.
class URL_Cleaner(object):
def __init__(self, old_strings, new_strings, term):
self.old_strings = old_strings
self.new_strings = new_strings
self.term = term
new_strings = []
def delete_term(self, new_strings):
for self.string in self.old_strings:
if self.term in string:
new_string = string.replace(term, "")
self.new_strings.append(new_string)
else:
self.new_strings.append(string)
return self.new_strings
print "\n" .join(new_strings) #for checking; will be removed later
strings = ["www.google.com", "http://www.google.com", "https://www.google.com"]
new_strings = []
www = URL_Cleaner(strings, new_strings, "www.")
Why are we making a class to do this?
for string in strings:
string.replace("www.","")
Isn't that what you're trying to accomplish?
Regardless the problem is in your class definition. Pay attention to scopes:
class URL_Cleaner(object):
def __init__(self, old_strings, new_strings, term):
"""These are all instance objects"""
self.old_strings = old_strings
self.new_strings = new_strings
self.term = term
new_strings = [] # this is a class object
def delete_term(self, new_strings):
"""You never actually call this function! It never does anything!"""
for self.string in self.old_strings:
if self.term in string:
new_string = string.replace(term, "")
self.new_strings.append(new_string)
else:
self.new_strings.append(string)
return self.new_strings
print "\n" .join(new_strings) #for checking; will be removed later
# this is referring the class object, and will be evaluated when
# the class is defined, NOT when the object is created!
I've commented your code the necessary reasons.... To fix:
class URL_Cleaner(object):
def __init__(self, old_strings):
"""Cleans URL of 'http://www.'"""
self.old_strings = old_strings
cleaned_strings = self.clean_strings()
def clean_strings(self):
"""Clean the strings"""
accumulator = []
for string in self.old_strings:
string = string.replace("http://", "").replace("www.", "")
# this might be better as string = re.sub("http://(?:www.)?", "", string)
# but I'm not going to introduce re yet.
accumulator.append(string)
return accumulator
# this whole function is just:
## return [re.sub("http://(?:www.)?", "", string, flags=re.I) for string in self.old_strings]
# but that's not as readable imo.
You just need to define new_strings as
self.new_strings = []
and remove new_strings argument from the constructor.
The 'new_strings' and 'self.new_strings' are two different lists.