I have a Python script with a config file that looks like this:
PROD = 'production'
DEV = 'dev'
ENVIRONMENT = None
and a function that gets the desired environment from a command-line argument and sets it like this:
if sys.argv[1] in [config.PROD, config.DEV]:
    config.ENVIRONMENT = sys.argv[1]
I realized this isn't good practice once I started importing the config file in multiple files and ENVIRONMENT kept resetting back to None.
So, my question is: what is the best practice in this case?
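(Note: the usual cause of the value "resetting" is importing the name instead of the module: from config import ENVIRONMENT copies the current value at import time, so later assignments to config.ENVIRONMENT are not seen. A minimal sketch of the attribute-access style that avoids this, with hypothetical module names:)
# config.py
PROD = 'production'
DEV = 'dev'
ENVIRONMENT = None

# main.py
import sys
import config

if sys.argv[1] in [config.PROD, config.DEV]:
    config.ENVIRONMENT = sys.argv[1]

# any_other_module.py
import config

def current_env():
    # Attribute lookup happens at call time, so this sees the value
    # set by main.py rather than a stale copy.
    return config.ENVIRONMENT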
I'm not sure exactly what the best practice is, but I like using JSON files. I use the following class as a layer of abstraction for interfacing with the config (properties) file. You can create one JSONPropertiesFile and pass it around your application.
import json
from collections import OrderedDict
import os
from stat import *  # ST_SIZE etc
from datetime import datetime
from copy import deepcopy


class JSONPropertiesFileError(Exception):
    pass


class JSONPropertiesFile(object):

    def __init__(self, file_path, default={}):
        self.file_path = file_path
        self._default_properties = default
        self._validate_file_path(file_path)

    def _validate_file_path(self, file_path):
        if not file_path.endswith(".json"):
            raise JSONPropertiesFileError(f"Must be a JSON file: {file_path}")
        if not os.path.exists(file_path):
            self.set(self._default_properties)

    def set(self, properties):
        new_properties = deepcopy(self._default_properties)
        new_properties.update(properties)
        with open(self.file_path, 'w') as file:
            json.dump(new_properties, file, indent=4)

    def get(self):
        properties = deepcopy(self._default_properties)
        with open(self.file_path) as file:
            properties.update(json.load(file, object_pairs_hook=OrderedDict))
        return properties

    def get_file_info(self):
        st = os.stat(self.file_path)
        res = {
            'size': st[ST_SIZE],
            'size_str': str(round(st[ST_SIZE] / 1000, 2)) + ' KB',
            'last_mod': datetime.fromtimestamp(st[ST_MTIME]).strftime("%Y-%m-%d")
        }
        return res
In your case you might use it like this:
file_path = "path/to/your/config/file"
default_properties = {
'PROD': 'production',
'DEV': 'dev',
'ENVIRONMENT': ""
}
config_file = JSONPropertiesFile(file_path, default_properties)
config = config_file.get()
print(config["PROD"])
config["PROD"] = "something else"
config_file.set(config) # save new config
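To avoid constructing the file in every module, you can build it once in a small shared module and import the object wherever it is needed. A minimal sketch, assuming the class above is saved in json_properties.py (both module names are made up):
# settings.py
from json_properties import JSONPropertiesFile

config_file = JSONPropertiesFile("config.json", {
    'PROD': 'production',
    'DEV': 'dev',
    'ENVIRONMENT': ""
})

# any other module
from settings import config_file

config = config_file.get()  # always reads the current state from disk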
I'm trying to save image URLs for individual properties in their respective CSV files via feeds export. For this to work, the FEEDS csv path in custom_settings has to change every time a scrapy.Request is yielded in start_requests. Every time a scrapy.Request is yielded, self.feeds_csv_path (initialized in __init__) is assigned a new CSV file path for that property id, computed by get_feeds_csv_path, as in the code below. The FEEDS entry in custom_settings doesn't seem to be able to see self.feeds_csv_path. Where is the error here?
import asyncio
from configparser import ConfigParser
import os
import pandas as pd
import scrapy
import requests
import json


class GetpropertyimgurlsSpider(scrapy.Spider):
    name = 'GetPropertyImgUrls'
    custom_settings = {
        "FEEDS": {
            self.feeds_csv_path: {
                "format": "csv",
                "overwrite": True
            }
        }
    }

    def __init__(self, *args, **kwargs):
        self.feeds_csv_path = None
        super(GetpropertyimgurlsSpider, self).__init__(*args, **kwargs)

    def start_requests(self):
        files = self.get_html_files()  # List of html file full paths
        for file in files[:2]:
            self.feeds_csv_path = self.get_feeds_csv_path(file)
            yield scrapy.Request(file, callback=self.parse)

    def parse(self, response):
        texts = response.xpath("//text()").getall()
        text = texts[1]
        json_text = json.loads(text)
        #print(text)
        photos = json_text["#graph"][3]["photo"]
        for photo in photos:
            yield photo["contentUrl"]

    def get_feeds_csv_path(self, html_file_path):
        property_id = html_file_path.split("/")[-2].split("_")[1]
        feeds_csv_path = f"{html_file_path}/images/Property_{property_id}_ImgSrcs.csv"
        return feeds_csv_path

    def get_path(self):
        config = ConfigParser()
        config.read("config.ini")  # Location relative to main.py
        path = config["scrapezoopla"]["path"]
        return path

    # Returns a list of html file dirs
    def get_html_files(self):
        path = self.get_path()
        dir = f"{path}/data/properties/"
        dir_list = os.listdir(dir)
        folders = []
        for ins in dir_list:
            if os.path.isdir(f"{dir}{ins}") == True:
                folders.append(ins)
        html_files = []
        for folder in folders:
            html_file = f"{dir}{folder}/{folder}.html"
            if os.path.isfile(html_file) == True:
                html_files.append(f"file:///{html_file}")
        return html_files
The first problem I see is that you are using the self keyword in the namespace scope of your spider class. The self keyword is only available inside instance methods, where you pass it in as the first argument, e.g. def __init__(self, ...).
Even if self were available, it still wouldn't work, because once you create the custom_settings dictionary, self.feeds_csv_path is immediately converted to its string value when the class body is executed, so updating the instance variable later would have no effect on the custom_settings property.
Another issue is that Scrapy collects all of the custom settings and stores them internally before the crawl is actually started, so updating the custom_settings dictionary mid-crawl might not actually have an effect. I am not certain about that, though.
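(For what it's worth, if you drive the spider from a script, settings can be assembled before the crawl begins. A hedged sketch using Scrapy's CrawlerProcess, with a made-up output path, assuming the spider no longer sets FEEDS in custom_settings:)
from scrapy.crawler import CrawlerProcess

# Build the FEEDS dict up front; the crawler reads settings when it is
# created, not continuously during the crawl.
feeds = {"output/Property_123_ImgSrcs.csv": {"format": "csv", "overwrite": True}}

process = CrawlerProcess(settings={"FEEDS": feeds})
process.crawl(GetpropertyimgurlsSpider)
process.start()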
All of that being said, your goal is still achievable. One means I can think of is creating the FEEDS dictionary at runtime, prior to initiating the crawl, and filtering with custom scrapy.Item classes to control which item belongs to which output file.
I have no way of testing it, so it might be buggy, but here is an example of what I am referring to:
from configparser import ConfigParser
import json
import os
import scrapy


def get_path():
    config = ConfigParser()
    config.read("config.ini")  # Location relative to main.py
    path = config["scrapezoopla"]["path"]
    return path


# Returns a list of html file dirs
def get_html_files():
    path = get_path()
    folder = f"{path}/data/properties/"
    dir_list = os.listdir(folder)
    html_files = []
    for ins in dir_list:
        if os.path.isdir(f"{folder}{ins}"):
            if os.path.isfile(f"{folder}{ins}/{ins}.html"):
                html_files.append(f"file:///{folder}{ins}/{ins}.html")
    return html_files


def get_feeds_csv_path(html_file_path):
    property_id = html_file_path.split("/")[-2].split("_")[1]
    feeds_csv_path = f"{html_file_path}/images/Property_{property_id}_ImgSrcs.csv"
    return feeds_csv_path


def create_custom_item():
    class Item(scrapy.Item):
        contentUrl = scrapy.Field()
    return Item


def customize_settings():
    feeds = {}
    files = get_html_files()
    start_urls = {}
    for path in files:
        custom_class = create_custom_item()
        output_path = get_feeds_csv_path(path)
        start_urls[path] = custom_class
        feeds[output_path] = {
            "format": "csv",
            "item_classes": [custom_class],
        }
    custom_settings = {"FEEDS": feeds}
    return custom_settings, start_urls


class GetpropertyimgurlsSpider(scrapy.Spider):
    name = 'GetPropertyImgUrls'
    custom_settings, start_urls = customize_settings()

    def start_requests(self):
        for uri, itemclass in self.start_urls.items():
            yield scrapy.Request(uri, callback=self.parse, cb_kwargs={'itemclass': itemclass})

    def parse(self, response, itemclass):
        texts = response.xpath("//text()").getall()
        text = texts[1]
        json_text = json.loads(text)
        photos = json_text["#graph"][3]["photo"]
        for photo in photos:
            item = itemclass()
            item['contentUrl'] = photo["contentUrl"]
            yield item
I have a class that is doing a lot of stuff. In the end, it saves everything into a pickle. When I rerun this class, I want to read the pickle instead of doing everything again. Unfortunately, the variable is always empty if I unpickle. Why is that?
import os
import pandas as pd


class Test:
    def __init__(self, path, value):
        # path points to a .txt file, but it's in the same folder as the pickle
        data_path, data = os.path.split(path)
        pickle_path = os.path.join(data_path, data.split('.')[0] + '.pickle')
        if os.path.isfile(pickle_path):
            self = pd.read_pickle(pickle_path)
        else:
            # do a ton of stuff and save it as a pickle afterwards
            ...

variable = Test(path, value)
In this case, variable is empty if I read from the pickle, but correct if I do all the stuff...
Assigning to self inside __init__ only rebinds a local name; the caller still receives the instance that was originally constructed, so the unpickled object is thrown away. If I want to cache some calculation results, I load/dump the object outside the class, something like:
import os
import pickle

pickle_path = os.path.join(data_path, data.split('.')[0] + '.pickle')
if os.path.isfile(pickle_path):
    with open(pickle_path, 'rb') as f:
        variable = pickle.load(f)  # use cached results
else:
    variable = Test()  # do all the calculations
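If you would rather keep the caching inside the class, a sketch of an alternative is a classmethod constructor that returns the unpickled instance instead of assigning to self (the name load_or_create is made up; the paths follow the question's naming):
import os
import pickle

class Test:
    def __init__(self, path, value):
        # ... do a ton of stuff ...
        self.value = value

    @classmethod
    def load_or_create(cls, path, value):
        data_path, data = os.path.split(path)
        pickle_path = os.path.join(data_path, data.split('.')[0] + '.pickle')
        if os.path.isfile(pickle_path):
            with open(pickle_path, 'rb') as f:
                return pickle.load(f)  # cached instance
        obj = cls(path, value)
        with open(pickle_path, 'wb') as f:
            pickle.dump(obj, f)  # cache for next time
        return obj

# usage:
# variable = Test.load_or_create(path, value)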
I need to mock pathlib.Path.open using pytest-mock.
The real open_func opens a YAML file. The return value is a regular dict. How can I mock Path.open to load another YAML file called test_config.yaml instead?
My code is not working properly: conf simply becomes a str ("test_config.yaml") when it should be a dict.
from pathlib import Path
import yaml


def open_func():
    with Path.open(Path("./config.yaml")) as f:
        return yaml.load(f, Loader=yaml.FullLoader)


def test_open_func(mocker):
    mocker.patch("pathlib.Path.open", mocker.mock_open(read_data="test_config.yaml"))
    conf = open_func()
    assert isinstance(conf, dict)
EDIT:
To get closer to my real-world problem, I am providing the following code. I have a class TryToMock that takes two files as inputs. The method load_files simply loads these files (which are actually .yaml configuration files) and returns the output.
In my unit tests, I will be calling TryToMock numerous times through pytest's parametrize. Therefore, I would like to load the original configuration files via a fixture; then I am able to monkeypatch some entries in my various tests before running load_files.
In order not to load the original files again, I need to mock the Path.open function in TryToMock and pass the monkeypatched YAML content instead (i.e. in the form of a dict). The difficulty is that I must discriminate between the two files; that is, I can't simply mock Path.open with the same file content.
# TryToMock.py
from pathlib import Path
import yaml

# In my current working folder, I have two .yaml files containing the following
# content for illustrative purposes:
#
# file1.yaml = {'name': 'test1', 'file_type': 'yaml'}
# file2.yaml = {'schema': 'test2', 'currencies': ['EUR', 'USD', 'JPY']}


class TryToMock:
    def __init__(self, file_to_mock_1, file_to_mock_2):
        self._file_to_mock_1 = file_to_mock_1
        self._file_to_mock_2 = file_to_mock_2

    def load_files(self):
        with Path.open(self._file_to_mock_1) as f:
            file1 = yaml.load(f, Loader=yaml.FullLoader)
        with Path.open(self._file_to_mock_2) as f:
            file2 = yaml.load(f, Loader=yaml.FullLoader)
        return file1, file2
# test_TryToMock.py
import os
from pathlib import Path

import pytest
import yaml

from tests import TryToMock


def yaml_files_for_test(yaml_content):
    names = {"file1.yaml": file1_content, "file2.yaml": file2_content}
    return os.path.join("./", names[os.path.basename(yaml_content)])


@pytest.fixture(scope="module")
def file1_content():
    with Path.open(Path("./file1.yaml")) as f:
        return yaml.load(f, Loader=yaml.FullLoader)


@pytest.fixture(scope="module")
def file2_content():
    with Path.open(Path("./file2.yaml")) as f:
        return yaml.load(f, Loader=yaml.FullLoader)


def test_try_to_mock(file1_content, file2_content, monkeypatch, mocker):
    file_1 = Path("./file1.yaml")
    file_2 = Path("./file2.yaml")
    m = TryToMock.TryToMock(file_to_mock_1=file_1, file_to_mock_2=file_2)
    # Change some items
    monkeypatch.setitem(file1_content, "file_type", "json")
    # Mocking - How does it work when I would like to use mock_open???
    # How should the lambda function look like?
    mocker.patch(
        "pathlib.Path.open",
        lambda x: mocker.mock_open(read_data=yaml_files_for_test(x)),
    )
    files = m.load_files()
    assert files[0]["file_type"] == "json"
You have to provide the actual file contents to the read_data argument of mock_open. You can just create the data in your test:
test_yaml = """
foo:
bar:
- VAR: "MyVar"
"""
def test_open_func(mocker):
mocker.patch("pathlib.Path.open", mocker.mock_open(read_data=test_yaml))
conf = open_func()
assert conf == {'foo': {'bar': [{'VAR': 'MyVar'}]}}
Or you can read the data from your test file:
def test_open_func(mocker):
    with open("my_fixture_path/test.yaml") as f:
        contents = f.read()
    mocker.patch("pathlib.Path.open", mocker.mock_open(read_data=contents))
    conf = open_func()
    assert isinstance(conf, dict)
The last case can also be rewritten to replace the path argument in the open call with your test path (the lambda's path parameter receives the Path instance, since Path.open is called as a method on it):
def test_open_func(mocker):
    mocker.patch("pathlib.Path.open", lambda path: open("test.yaml"))
    conf = open_func()
    assert isinstance(conf, dict)
or, if you have different test files for different configs, something like:
def yaml_path_for_test(yaml_path):
    names = {
        "config.yaml": "test.yaml",
        ...
    }
    return os.path.join(my_fixture_path, names[os.path.basename(yaml_path)])


def test_open_func3(mocker):
    mocker.patch("pathlib.Path.open", lambda path: open(yaml_path_for_test(path)))
    conf = open_func()
    assert isinstance(conf, dict)
This is probably what you wanted to achieve in your test code.
UPDATE:
This is related to the second part of the question (after the edit). If you have the module-scoped fixtures that preload the fixture files as in the question, you can do something like this:
def test_open_func(mocker, file1_content, file2_content):
    def yaml_files_for_test(path):
        contents = {"file1.yaml": file1_content,
                    "file2.yaml": file2_content}
        data = contents[os.path.basename(path)]
        mock = mocker.mock_open(read_data=yaml.dump(data))
        return mock.return_value

    mocker.patch("pathlib.Path.open", yaml_files_for_test)
    conf = open_func()
    assert isinstance(conf, dict)
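The mock.return_value here is the file-handle mock: mocker.mock_open(...) builds a callable that produces the handle, but since we are patching Path.open with a plain function rather than the mock itself, the function has to return the handle directly.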
or, if you prefer not to use nested functions:
def yaml_files_for_test(path, mocker, content1, content2):
    contents = {"file1.yaml": content1,
                "file2.yaml": content2}
    data = contents[os.path.basename(path)]
    mock = mocker.mock_open(read_data=yaml.dump(data))
    return mock.return_value


def test_open_func5(mocker, file1_content, file2_content):
    mocker.patch("pathlib.Path.open",
                 lambda path: yaml_files_for_test(path, mocker,
                                                  file1_content, file2_content))
    conf = open_func()
    assert isinstance(conf, dict)
I see there are a few of these questions on here, but I haven't found one that quite matches what I'm after.
I have a common file, let's call it tools.py. In this file I have a host of path definitions and an init_paths function to set some key paths based on command-line arguments:
def init_paths(args):
    global tools_dir, tools_src, tools_bin
    if args.tools_set:
        tools_dir = os.path.realpath(os.path.join(args.tools_set, "tools"))
    else:
        tools_dir = os.path.join(BAR_PATH, "tools")


FOO_PATH = os.path.realpath(os.path.join(os.path.dirname(os.path.realpath(__file__)), ".."))
BAR_PATH = os.path.join(FOO_PATH, "foobar")
tools_dir = None
tools_src = None
tools_bin = None
etc...
I have a main file, let's call it main.py, where I want to use these:
if __name__ == "__main__":
    args = parseArgs()
    from tools import init_paths
    init_paths(args)
    doStuffFunction(args.one, args.two, args.three)
I have left out the meat and potatoes, to be sure, but I believe this should be enough to illustrate my global scope problem. When I run python main.py --tools-set=/path/to/tools, I expect the call to init_paths to set up some key paths I wish to use later in the doStuffFunction():
def doStuffFunction():
    searchPath = os.path.join(tools_dir, "folder")
This fails with: AttributeError: 'NoneType' object has no attribute 'endswith'
I'm pretty sure this is because tools_dir is not getting set. But why?
EDIT
main.py
#!/usr/bin/env python
import sys
import os
import argparse
import glob

from tools import *


def parseArgs():
    parser = argparse.ArgumentParser(description="parse my args")
    parser.add_argument("--toolchain-root", type=str, default=None, help='specify toolchain directory')
    args = parser.parse_args()
    return args


def doStuffFunction():
    output = 'output'
    if not os.path.isdir(output):
        os.makedirs(output)
    gimmySugar(output)


def gimmySugar(output):
    fileList = []
    linkBook = {}
    searchPath = os.path.join(tools_BIN_ROOT, 'gcc-4.8.5')
    for root, dirs, files in os.walk(searchPath):
        for libFile in glob.glob(root + '/*.so*'):
            fileList.append(libFile)
            if os.path.islink(libFile):
                linksWith = os.readlink(libFile)
                linkBook[libFile] = linksWith


if __name__ == "__main__":
    # script was called directly from the command line
    args = parseArgs()
    from tools import init_settings
    init_settings(args)
    doStuffFunction()
tools.py
import os


def init_settings(args):
    global tools_DIR, tools_SRC_ROOT, tools_OBJ_ROOT, tools_BIN_ROOT
    if args.toolchain_root:
        tools_DIR = os.path.realpath(os.path.join(args.toolchain_root, "toolchain"))
    else:
        tools_DIR = os.path.join(USER_DIR, "")
    tools_SRC_ROOT = os.path.join(tools_DIR, "src")
    tools_OBJ_ROOT = os.path.join(tools_DIR, "obj")
    tools_BIN_ROOT = os.path.join(tools_DIR, "bin")


ROOT_PATH = os.path.realpath(os.path.join(os.path.dirname(os.path.realpath(__file__)), ".."))
OUTPUT_PATH = os.path.join(ROOT_PATH, "outputs")
BUILD_PATH = os.path.join(OUTPUT_PATH, "build")
USER_DIR = "/usr/lib64/"
tools_DIR = None
tools_SRC_ROOT = None
tools_OBJ_ROOT = None
tools_BIN_ROOT = None
# This line will fail:
searchPath = os.path.join(tools_BIN_ROOT, 'gcc-4.8.5')
The scope of a global variable is the module: tools_BIN_ROOT will not be shared across modules without explicitly passing it around.
The variable tools_BIN_ROOT is a global variable ONLY inside tools.py. main.py does not see updates to it, because from tools import * binds the names to their current values at import time; when init_settings later reassigns tools_BIN_ROOT inside tools.py, the copy in main.py still points at None.
To inspect this, you can use print(globals()) in both files.
Do not use global variables if possible.
This is a simple workaround.
(I strongly recommend you refactor your code around a config object or OOP instead.)
tools.py
def get_tools_BIN_ROOT():
    return tools_BIN_ROOT
main.py
import os

from tools import get_tools_BIN_ROOT

searchPath = os.path.join(get_tools_BIN_ROOT(), 'gcc-4.8.5')
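As a sketch of the recommended refactor (the class and attribute names are made up), wrap the paths in a settings object created once from the parsed arguments and pass it around explicitly instead of relying on module globals:
# tools.py
import os

class ToolSettings:
    def __init__(self, toolchain_root=None, user_dir="/usr/lib64/"):
        # Resolve all derived paths once, at construction time
        if toolchain_root:
            self.tools_dir = os.path.realpath(os.path.join(toolchain_root, "toolchain"))
        else:
            self.tools_dir = os.path.join(user_dir, "")
        self.src_root = os.path.join(self.tools_dir, "src")
        self.obj_root = os.path.join(self.tools_dir, "obj")
        self.bin_root = os.path.join(self.tools_dir, "bin")

# main.py
# settings = ToolSettings(args.toolchain_root)
# searchPath = os.path.join(settings.bin_root, 'gcc-4.8.5')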
I have a Python script which is controlled by a config file called system.config. The structure of the config file is as below, with some default values:
[company]
companyname: XYZ

[profile]
name: ABC
joining: 1/1/2014
The code for reading the config file, config_parser_details.py, is:
import ConfigParser
import sys

Config = ConfigParser.ConfigParser()
Config.read("system.config")
filename = "system.config"


def ConfigSectionMap(section):
    dict1 = {}
    options = Config.options(section)
    for option in options:
        try:
            dict1[option] = Config.get(section, option)
            if dict1[option] == -1:
                DebugPrint("skip: %s" % option)
        except:
            print("exception on %s!" % option)
            dict1[option] = None
    return dict1


company = ConfigSectionMap("company")['companyname']
name = ConfigSectionMap("profile")['name']
joindate = ConfigSectionMap("profile")['joining']
Now the code for my script, test.py, is:
import config_parser_details as p
import sys
import warnings
import os

company = p.company
name = p.name
date = p.joindate

print("%s\n" % company)
print("%s\n" % name)
The output is:
XYZ
ABC
Now I want to be able to override values in the config file from the command line, like:
python test.py --compname="testing"
If any argument is missing from the command line, the default value from the config file should be used.
You could use the argparse library to parse command-line arguments.
Your test.py file then looks like this:
import config_parser_details as p
import sys
import warnings
import os
import argparse

commandLineArgumentParser = argparse.ArgumentParser()
commandLineArgumentParser.add_argument("-c", "--compname", help="Company name", default=p.company)
commandLineArguments = commandLineArgumentParser.parse_args()

company = commandLineArguments.compname
name = p.name
date = p.joindate

print("%s\n" % company)
print("%s\n" % name)
I'd advise looking into a tool like docopt.
For a quick fix, though, you can try doing this:
def ConfigSectionMap(section):
    options = dict(Config.items(section))  # start from the defaults in the file
    arg_dict = {}
    for command_line_argument in sys.argv[1:]:
        arg = command_line_argument.split("=")
        arg_dict[arg[0][2:]] = arg[1]  # strip the leading "--"
    for key in arg_dict:
        options[key] = arg_dict[key]
    return options
This will load up all the default options; any options given on the command line will override or add to the options dict.
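For example, with the quick fix in place, an invocation like
python test.py --companyname=testing
would override the companyname option read from system.config (the key after -- must match the option name in the file).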
First of all, I'd move the code into a main function so that you can import config_parser_details without executing code:
def main():
    Config = ConfigParser.ConfigParser()
    Config.read("system.config")
    filename = "system.config"

    company = ConfigSectionMap("company")['companyname']
    name = ConfigSectionMap("profile")['name']
    joindate = ConfigSectionMap("profile")['joining']


if __name__ == '__main__':
    main()
Secondly, I'd use STB land's suggestion of parsing the command line with argparse, something like:
def main():
    # do the parsing thing first, then:
    filename = args.filename
    do_stuff(filename)
This way you can neatly use Python's own unit test framework or nosetests to write test files that don't require you to manually specify parameters:
import tempfile

def test_basic():
    # create a temporary file with tempfile.NamedTemporaryFile
    tmpfile = tempfile.NamedTemporaryFile()
    # add test data to tmpfile
    do_stuff(tmpfile)
    # check the output
    assert ....
This comes with the added benefit of not having global variables, which will complicate your life later.