I have a Flask web app that uses a large DataFrame (hundreds of megabytes). The DataFrame is used in the app for several different machine learning models. I want to create the DataFrame only once in the application and use it across multiple requests so that the user may build different models based on the same data. The Flask session is not built for large data, so that is not an option. I do not want to go back and recreate the DataFrame in case the source of the data is a CSV file (yuck).
I have a solution that works, but I cannot find any discussion of it on Stack Overflow. That makes me suspicious that my solution may not be a good design idea. I have always assumed that a well-beaten path in software development is a path well chosen.
My solution is simply to create a data holder class with one class variable:
class DataHolder:
    dataFrameHolder = None
Now the dataFrameHolder is known across all class instances (like a static variable in Java) since it is stored in memory on the server.
I can now create the DataFrame once, put it into the DataHolder class:
import pandas as pd
from dataholder import DataHolder
result_set = pd.read_sql_query(some_SQL, connection)
df = pd.DataFrame(result_set, columns=['col1', 'col2', ...])
DataHolder.dataFrameHolder = df
Then access that DataFrame from any code that imports the DataHolder class. I can then use the stored DataFrame anywhere in the application, including across different requests:
.
.
modelDataFrame = DataHolder.dataFrameHolder
do_some_model(modelDataFrame)
.
.
Is this a bad idea, a good idea, or is there something else that I am not aware of that already solves the problem?
Redis can be used. My use case involves smaller data frames, so I have not tested this with larger ones. It lets me push 3-second ticking data to multiple browser clients. pyarrow serialisation/deserialisation is performing well, and it works locally and across AWS, GCloud and Azure.
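As a minimal sketch of that round trip (assuming a local Redis instance; note that pa.serialize/pa.deserialize were deprecated in later pyarrow releases, this mirrors the calls in the utility class below):

import pandas as pd
import pyarrow as pa
import redis

r = redis.Redis.from_url("redis://127.0.0.1")        # assumed local Redis instance
df = pd.DataFrame({"epoch": [1, 2, 3], "value": [0.1, 0.2, 0.3]})

r.set("dfsensor", pa.serialize(df).to_buffer().to_pybytes())   # DataFrame -> bytes
restored = pa.deserialize(r.get("dfsensor"))                   # bytes -> DataFrame
print(restored.equals(df))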
GET route
@app.route('/cacheget/<path:key>', methods=['GET'])
def cacheget(key):
    c = mycache()
    data = c.redis().get(key)
    resp = Response(BytesIO(data), mimetype="application/octet-stream", direct_passthrough=True)
    resp.headers["key"] = key
    resp.headers["type"] = c.redis().get(f"{key}.type")
    resp.headers["size"] = sys.getsizeof(data)
    resp.headers["redissize"] = sys.getsizeof(c.redis().get(key))
    return resp
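A rough sketch of how a client could consume that route (host, port and key name are assumptions):

import requests
import pyarrow as pa

resp = requests.get("http://localhost:5000/cacheget/dfsensor")   # assumed local dev server and key
df = pa.deserialize(resp.content)    # octet-stream body back into a DataFrame
print(resp.headers["type"], df.shape)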
sample route to put dataframe into cache
@app.route('/sensor_data', methods=['POST'])
def sensor_data() -> str:
    c = mycache()
    dfsensor = c.get("dfsensor")
    newsensor = json_normalize(request.get_json())

    newsensor[["x", "y"]] = newsensor[["epoch", "value"]]
    newsensor["xy"] = newsensor[['x', 'y']].agg(pd.Series.to_dict, axis=1)
    newsensor["amin"] = newsensor["value"]
    newsensor["amax"] = newsensor["value"]
    newsensor = newsensor.drop(columns=["x", "y"])

    # add new data from serial interface to start of list (append old data to new data).
    # default time as now to new data
    dfsensor = newsensor.append(dfsensor, sort=False)

    # keep size down - only last 500 observations
    c.set("dfsensor", dfsensor[:500])
    del dfsensor

    return jsonify(result={"status": "ok"})
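For completeness, a hedged sketch of a client posting one observation to that route (URL and payload fields are assumptions):

import requests

payload = {"epoch": 1650000000, "value": 21.7}    # hypothetical observation
r = requests.post("http://localhost:5000/sensor_data", json=payload)
print(r.json())    # expected: {"result": {"status": "ok"}}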
utility class
import pandas as pd
import pyarrow as pa, os
import redis, json, os, pickle
import ebutils
from logenv import logenv
from pandas.core.frame import DataFrame
from redis.client import Redis
from typing import (Union, Optional)


class mycache():
    __redisClient: Redis
    CONFIGKEY = "cacheconfig"

    def __init__(self) -> None:
        try:
            ep = os.environ["REDIS_HOST"]
        except KeyError:
            if os.environ["HOST_ENV"] == "GCLOUD":
                os.environ["REDIS_HOST"] = "redis://10.0.0.3"
            elif os.environ["HOST_ENV"] == "EB":
                os.environ["REDIS_HOST"] = "redis://" + ebutils.get_redis_endpoint()
            elif os.environ["HOST_ENV"] == "AZURE":
                # os.environ["REDIS_HOST"] = "redis://ignore:password@redis-sensorvenv.redis.cache.windows.net"
                pass  # should be set in an Azure env variable
            elif os.environ["HOST_ENV"] == "LOCAL":
                os.environ["REDIS_HOST"] = "redis://127.0.0.1"
            else:
                # no known redis setup
                raise RuntimeError("could not initialise redis")

        # self.__redisClient = redis.Redis(host=os.environ["REDIS_HOST"])
        self.__redisClient = redis.Redis.from_url(os.environ["REDIS_HOST"])
        self.__redisClient.ping()
        # get config as well...
        self.config = self.get(self.CONFIGKEY)
        if self.config is None:
            self.config = {"pyarrow": True, "pickle": False}
            self.set(self.CONFIGKEY, self.config)
        self.alog = logenv.alog()

    def redis(self) -> Redis:
        return self.__redisClient

    def exists(self, key: str) -> bool:
        if self.__redisClient is None:
            return False
        return self.__redisClient.exists(key) == 1

    def get(self, key: str) -> Union[DataFrame, str]:
        keytype = "{k}.type".format(k=key)
        valuetype = self.__redisClient.get(keytype)
        if valuetype is None:
            if key.split(".")[-1] == "pickle":
                return pickle.loads(self.redis().get(key))
            else:
                ret = self.redis().get(key)
                if ret is None:
                    return ret
                else:
                    return ret.decode()
        elif valuetype.decode() == str(pd.DataFrame):
            # fall back to the pickle-serialised form if pyarrow fails
            # https://issues.apache.org/jira/browse/ARROW-7961
            try:
                return pa.deserialize(self.__redisClient.get(key))
            except pa.lib.ArrowIOError as err:
                self.alog.warning("using pickle from cache %s - %s - %s", key, pa.__version__, str(err))
                return pickle.loads(self.redis().get(f"{key}.pickle"))
            except OSError as err:
                if "Expected IPC" in str(err):
                    self.alog.warning("using pickle from cache %s - %s - %s", key, pa.__version__, str(err))
                    return pickle.loads(self.redis().get(f"{key}.pickle"))
                else:
                    raise err
        elif valuetype.decode() == str(type({})):
            return json.loads(self.__redisClient.get(key).decode())
        else:
            return self.__redisClient.get(key).decode()  # type: ignore

    def set(self, key: str, value: Union[DataFrame, str]) -> None:
        if self.__redisClient is None:
            return
        keytype = "{k}.type".format(k=key)
        if str(type(value)) == str(pd.DataFrame):
            self.__redisClient.set(key, pa.serialize(value).to_buffer().to_pybytes())
            if self.config["pickle"]:
                self.redis().set(f"{key}.pickle", pickle.dumps(value))
                # issue should be transient through an upgrade....
                # once switched off, the data can go away
                self.redis().expire(f"{key}.pickle", 60 * 60 * 24)
        elif str(type(value)) == str(type({})):
            self.__redisClient.set(key, json.dumps(value))
        else:
            self.__redisClient.set(key, value)
        self.__redisClient.set(keytype, str(type(value)))


if __name__ == '__main__':
    os.environ["HOST_ENV"] = "LOCAL"
    r = mycache()
    rr = r.redis()
    for k in rr.keys("cache*"):
        print(k.decode(), rr.ttl(k))
        print(rr.get(k.decode()))
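A quick usage sketch of the class above (assuming a Redis instance reachable on 127.0.0.1, and reusing the imports from the block above):

os.environ["HOST_ENV"] = "LOCAL"
c = mycache()
c.set("dfsensor", pd.DataFrame({"value": [1.0, 2.0]}))   # stored via pyarrow as bytes
print(c.get("dfsensor"))                                 # round-trips back into a DataFrame
print(c.exists("dfsensor"))                              # True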
I had a similar problem, as I was importing CSVs (hundreds of MBs) and creating DataFrames on the fly for each request, which, as you said, was yucky! I also tried the Redis way to cache it, and that improved performance for a while, until I realized that making changes to the underlying data meant updating the cache as well.
Then I found a world beyond CSV, and more performant file formats like Pickle, Feather, Parquet, and others. You can read more about them here. You can import/export CSVs all you want, but use intermediate formats for processing.
I did run into some issues though. I have read that Pickle has security issues, even though I still use it. Feather wouldn't let me write some object types in my data; it needed them categorized. Your mileage may vary, but if you have good clean data, use Feather.
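For example, a rough sketch of the CSV-to-intermediate-format switch (file names are hypothetical; Feather/Parquet need pyarrow or fastparquet installed):

import pandas as pd

df = pd.read_csv("big_source.csv")       # slow: parse the CSV once
df.to_feather("big_source.feather")      # or df.to_parquet("big_source.parquet")

# subsequent requests load the fast intermediate format instead of re-parsing the CSV
df = pd.read_feather("big_source.feather")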
More recently I've found that I can manage large data using Datatable instead of Pandas, storing it in the Jay format for even better read/write performance.
This does, however, mean rewriting the bits of code that use Pandas in Datatable, but I believe the APIs are very similar. I have not done it myself yet because of a very large codebase, but you can give it a try.
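A minimal sketch of the Datatable/Jay route (requires the datatable package; file names are hypothetical):

import datatable as dt

frame = dt.fread("big_source.csv")     # datatable's fast CSV reader
frame.to_jay("big_source.jay")         # binary Jay format, very fast to reopen

frame = dt.fread("big_source.jay")     # fread also opens .jay files
df = frame.to_pandas()                 # convert only where a pandas API is still needed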
I am running a dask.distributed Client that gets data from an API with several parameters, parses the results, and joins/aggregates each result. This is done with client.map().
Sometimes the API call returns an empty string because the specific combination of input parameters doesn't exist. It doesn't make sense to continue the computation in that case, and I would like to just kill that worker (without passing on e.g. a None).
How do I tell Dask to kill a worker if its result is None/error and exclude that future from the following operations?
Please let me know if you need more details.
Thanks.
EDIT:
Added a minimal working example to show the logic: the first map produces a lot of "useless" workers that I would like to kill.
Please note that this is not my actual use case; I am querying an Influx database via HTTP requests, but the general structure of the code is the same. I am open to any comments on how to do this faster/more efficiently.
```python
import requests
import numpy as np
import pandas as pd
from dask.distributed import Client, LocalCluster, as_completed
import dask.dataframe as dd


def fetch_html(pair):
    req_string = 'https://www.bitstamp.net/api/v2/order_book/{currency_pair}/'
    response = requests.get(req_string.format(currency_pair=pair))
    try:
        result = response.json()
        return result
    except Exception as e:
        print('Error: {}\nMessage: {}'.format(e, response.reason))
        return None


def parse_result(result):
    if result:
        data = {}
        data['prices'] = [e[0] for e in result['bids']]
        data['vols'] = [e[1] for e in result['bids']]
        data['index'] = [result['timestamp'] for i in data['prices']]
        df = pd.DataFrame.from_dict(data).set_index('index')
        return df
    else:
        return pd.DataFrame()


def other_calcs(result):
    if not result.empty:
        # something
        return result
    else:
        return pd.DataFrame()


def aggregator(res1, res2):
    if (not res1.empty) and (not res2.empty):
        # something
        return res1
    elif not res2.empty:
        # something
        return res2
    elif not res1.empty:
        return res1
    else:
        return pd.DataFrame()


if __name__ == '__main__':

    pairs = [
        # legit params (100s of these):
        'btcusd',
        'btceur',
        'btcgbp',
        'bateur',
        'batbtc',
        'umausd',
        'xrpusdt',
        'eurteur',
        'eurtusd',
        'manausd',
        'sandeur',
        'storjusd',
        'storjeur',
        'adausd',
        'adaeur',
        # bad params resulting in error / empty result (100s of these)
        'foobar',
        'foobaz',
        'foousd',
        'barbaz',
        'bazbar',
    ]

    cluster = LocalCluster(n_workers=16, threads_per_worker=1)
    client = Client(cluster)

    futures_list = client.map(fetch_html, pairs)
    futures_list = client.map(parse_result, futures_list)
    futures_list = client.map(other_calcs, futures_list)

    seq = as_completed(futures_list)
    while seq.count() > 1:
        f1 = next(seq)
        f2 = next(seq)
        new = client.submit(aggregator, f1, f2, priority=1)
        seq.add(new)

    final = next(seq)
    final = final.result()
    print(final.head())
```
The Python API is available to read objects from a cluster. By cloning we mean:
1. Get a copy of an existing Kubernetes object using kubectl get
2. Change the properties of the object
3. Apply the new object
Until recently, the kubectl --export option could be used for this, but it was deprecated in 1.14. How can we use the Python Kubernetes API to do steps 1-3 described above?
There are multiple questions about how to extract the code from Python API to YAML, but it's unclear how to transform the Kubernetes API object.
Just use to_dict() which is now offered by Kubernetes Client objects. Note that it creates a partly deep copy. So to be safe:
copied_obj = copy.deepcopy(obj.to_dict())
Dicts can be passed to create* and patch* methods.
For convenience, you can also wrap the dict in Prodict.
copied_obj = Prodict.from_dict(copy.deepcopy(obj.to_dict()))
The final issue is getting rid of superfluous fields. (Unfortunately, Kubernetes sprinkles them throughout the object.) I use kopf's internal facility for getting the "essence" of an object. (It takes care of the deep copy.)
copied_obj = kopf.AnnotationsDiffBaseStorage().build(body=kopf.Body(obj.to_dict()))
copied_obj = Prodict.from_dict(copied_obj)
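For illustration, a minimal sketch of this flow (the deployment name "web" and namespace "default" are assumptions, and kube config is loaded from the default location):

import copy

from kubernetes import client, config
from prodict import Prodict

config.load_kube_config()
apps = client.AppsV1Api()

# 1. read the existing object
obj = apps.read_namespaced_deployment("web", "default")

# 2. take a safe deep copy as a dict and change its properties
copied_obj = Prodict.from_dict(copy.deepcopy(obj.to_dict()))
copied_obj["metadata"]["name"] = "web-clone"
copied_obj["status"] = None            # drop server-managed state before re-applying

# 3. the cleaned-up dict can then be handed to the create*/patch* methods as described above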
After looking at the requirement, I spent a couple of hours researching the Kubernetes Python API. Issue 340 and others ask how to transform a Kubernetes API object into a dict, but the only workaround I found was to retrieve the raw data and then convert it to JSON.
The following code uses the Kubernetes API to get a deployment and its related HPA from a given namespace, retrieving their raw values as JSON.
Then, after transforming the data into a dict, you can optionally clean it up by removing null references.
Once you are done, you can render the dict as a YAML payload and save it to the file system.
Finally, you can apply it either with kubectl or with the Kubernetes Python API.
Note:
Make sure to set KUBECONFIG=config so that you can point to a cluster
Make sure to adjust the values of origin_obj_name = "istio-ingressgateway" and origin_obj_namespace = "istio-system" with the name of the corresponding objects to be cloned in the given namespace.
import os
import logging
import yaml
import json

logging.basicConfig(level=logging.INFO)

import crayons

from kubernetes import client, config
from kubernetes.client.rest import ApiException

LOGGER = logging.getLogger(" IngressGatewayCreator ")


class IngressGatewayCreator:

    @staticmethod
    def clone_default_ingress(clone_context):
        # Clone the deployment
        IngressGatewayCreator.clone_deployment_object(clone_context)

        # Clone the deployment's HPA
        IngressGatewayCreator.clone_hpa_object(clone_context)

    @staticmethod
    def clone_deployment_object(clone_context):
        kubeconfig = os.getenv('KUBECONFIG')
        config.load_kube_config(kubeconfig)
        v1apps = client.AppsV1beta1Api()

        deployment_name = clone_context.origin_obj_name
        namespace = clone_context.origin_obj_namespace

        try:
            # gets an instance of the api without deserialization to model
            # https://github.com/kubernetes-client/python/issues/574#issuecomment-405400414
            deployment = v1apps.read_namespaced_deployment(deployment_name, namespace, _preload_content=False)
        except ApiException as error:
            if error.status == 404:
                LOGGER.info("Deployment %s not found in namespace %s", deployment_name, namespace)
                return
            raise

        # Clone the deployment object as a dict
        cloned_dict = IngressGatewayCreator.clone_k8s_object(deployment, clone_context)

        # Change additional fields
        cloned_dict["spec"]["selector"]["matchLabels"]["istio"] = clone_context.name
        cloned_dict["spec"]["template"]["metadata"]["labels"]["istio"] = clone_context.name

        # Save the deployment template in the output dir
        clone_context.save_clone_as_yaml(cloned_dict, "deployment")

    @staticmethod
    def clone_hpa_object(clone_context):
        kubeconfig = os.getenv('KUBECONFIG')
        config.load_kube_config(kubeconfig)
        hpas = client.AutoscalingV1Api()

        hpa_name = clone_context.origin_obj_name
        namespace = clone_context.origin_obj_namespace

        try:
            # gets an instance of the api without deserialization to model
            # https://github.com/kubernetes-client/python/issues/574#issuecomment-405400414
            hpa = hpas.read_namespaced_horizontal_pod_autoscaler(hpa_name, namespace, _preload_content=False)
        except ApiException as error:
            if error.status == 404:
                LOGGER.info("HPA %s not found in namespace %s", hpa_name, namespace)
                return
            raise

        # Clone the HPA object as a dict
        cloned_dict = IngressGatewayCreator.clone_k8s_object(hpa, clone_context)

        # Change additional fields
        cloned_dict["spec"]["scaleTargetRef"]["name"] = clone_context.name

        # Save the HPA template in the output dir
        clone_context.save_clone_as_yaml(cloned_dict, "hpa")

    @staticmethod
    def clone_k8s_object(k8s_object, clone_context):
        # Manipulate at the dict level (not via the k8s api models), using the fetched raw object
        # https://github.com/kubernetes-client/python/issues/574#issuecomment-405400414
        cloned_obj = json.loads(k8s_object.data)

        labels = cloned_obj['metadata']['labels']
        labels['istio'] = clone_context.name

        cloned_obj['status'] = None

        # Scrub by removing the "null" and "None" values
        cloned_obj = IngressGatewayCreator.scrub_dict(cloned_obj)

        # Patch the metadata with the name and labels adjusted
        cloned_obj['metadata'] = {
            "name": clone_context.name,
            "namespace": clone_context.origin_obj_namespace,
            "labels": labels
        }

        return cloned_obj

    # https://stackoverflow.com/questions/12118695/efficient-way-to-remove-keys-with-empty-strings-from-a-dict/59959570#59959570
    @staticmethod
    def scrub_dict(d):
        new_dict = {}
        for k, v in d.items():
            if isinstance(v, dict):
                v = IngressGatewayCreator.scrub_dict(v)
            if isinstance(v, list):
                v = IngressGatewayCreator.scrub_list(v)
            if v not in (u'', None, {}):
                new_dict[k] = v
        return new_dict

    # https://stackoverflow.com/questions/12118695/efficient-way-to-remove-keys-with-empty-strings-from-a-dict/59959570#59959570
    @staticmethod
    def scrub_list(d):
        scrubbed_list = []
        for i in d:
            if isinstance(i, dict):
                i = IngressGatewayCreator.scrub_dict(i)
            scrubbed_list.append(i)
        return scrubbed_list


class IngressGatewayContext:
    def __init__(self, manifest_dir, name, hostname, nats, type):
        self.manifest_dir = manifest_dir
        self.name = name
        self.hostname = hostname
        self.nats = nats
        self.ingress_type = type
        self.origin_obj_name = "istio-ingressgateway"
        self.origin_obj_namespace = "istio-system"

    def save_clone_as_yaml(self, k8s_object, kind):
        try:
            # Just try to create the dir in case it doesn't exist
            os.makedirs(self.manifest_dir)
        except FileExistsError:
            LOGGER.debug("Dir already exists %s", self.manifest_dir)

        full_file_path = os.path.join(self.manifest_dir, self.name + '-' + kind + '.yaml')

        # Store in the file system with the name provided
        # https://stackoverflow.com/questions/12470665/how-can-i-write-data-in-yaml-format-in-a-file/18210750#18210750
        with open(full_file_path, 'w') as yaml_file:
            yaml.dump(k8s_object, yaml_file, default_flow_style=False)

        LOGGER.info(crayons.yellow("Saved %s '%s' at %s: \n%s"), kind, self.name, full_file_path, k8s_object)


try:
    k8s_clone_name = "http2-ingressgateway"
    hostname = "my-nlb-awesome.a.company.com"
    nats = ["123.345.678.11", "333.444.222.111", "33.221.444.23"]
    manifest_dir = "out/clones"

    context = IngressGatewayContext(manifest_dir, k8s_clone_name, hostname, nats, "nlb")

    IngressGatewayCreator.clone_default_ingress(context)
except Exception as err:
    print("ERROR: {}".format(err))
Not python, but I've used jq in the past to quickly clone something with the small customisations required for each use case (usually cloning secrets into a new namespace).
kc get pod whatever-85pmk -o json \
| jq 'del(.status, .metadata ) | .metadata.name="newname"' \
| kc apply -f - -o yaml --dry-run
This is really easy to do with Hikaru.
Here is an example from my own open source project:
def duplicate_without_fields(obj: HikaruBase, omitted_fields: List[str]):
    """
    Duplicate a hikaru object, omitting the specified fields

    This is useful when you want to compare two versions of an object and first "clean up" fields that shouldn't be
    compared.

    :param HikaruBase obj: A kubernetes object
    :param List[str] omitted_fields: List of fields to be omitted. Field name format should be '.' separated
        For example: ["status", "metadata.generation"]
    """
    if obj is None:
        return None

    duplication = obj.dup()
    for field_name in omitted_fields:
        field_parts = field_name.split(".")
        try:
            if len(field_parts) > 1:
                parent_obj = duplication.object_at_path(field_parts[:-1])
            else:
                parent_obj = duplication

            setattr(parent_obj, field_parts[-1], None)
        except Exception:
            pass  # in case the field doesn't exist on this object

    return duplication
Dumping the object to yaml afterwards or re-applying it to the cluster is trivial with Hikaru
We're using this to clean up objects so that we can show users a GitHub-style diff when objects change, without spammy fields that change often, like generation.
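A rough usage sketch (assuming the helper above is in scope and a manifest file exists on disk; the file name is hypothetical):

from hikaru import load_full_yaml, get_yaml

docs = load_full_yaml(path="deployment.yaml")            # parse the manifest into Hikaru objects
cleaned = duplicate_without_fields(docs[0], ["status", "metadata.generation"])
print(get_yaml(cleaned))                                 # YAML ready to save or re-apply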
I'm trying to implement this example:
https://github.com/Azure/azure-documentdb-python/blob/master/samples/DatabaseManagement/Program.py
To fetch data from azure documentdb and do some visualization. However, I would like to use a query on the line where it says #error here instead.
def read_database(client, id):
    print('3. Read a database by id')

    try:
        db = next((data for data in client.ReadDatabases() if data['id'] == database_id))
        coll = next((coll for coll in client.ReadCollections(db['_self']) if coll['id'] == database_collection))
        return list(itertools.islice(client.ReadDocuments(coll['_self']), 0, 100, 1))
    except errors.DocumentDBError as e:
        if e.status_code == 404:
            print('A Database with id \'{0}\' does not exist'.format(id))
        else:
            raise errors.HTTPFailure(e.status_code)
The fetching is really slow when I want to get >10k items, how can I improve this?
Thanks!
You can't query documents directly through the database entity.
The parameters of the ReadDocuments() method used in your code should be collection link and query options.
def ReadDocuments(self, collection_link, feed_options=None):
    """Reads all documents in a collection.

    :Parameters:
        - `collection_link`: str, the link to the document collection.
        - `feed_options`: dict

    :Returns:
        query_iterable.QueryIterable
    """
    if feed_options is None:
        feed_options = {}

    return self.QueryDocuments(collection_link, None, feed_options)
So, you could modify your code as below:
# Initialize the Python DocumentDB client
client = document_client.DocumentClient(config['ENDPOINT'], {'masterKey': config['MASTERKEY']})

db = "db"
coll = "coll"

try:
    database_link = 'dbs/' + db
    database = client.ReadDatabase(database_link)

    collection_link = 'dbs/' + db + "/colls/" + coll
    collection = client.ReadCollection(collection_link)

    # options = {}
    # options['enableCrossPartitionQuery'] = True
    # options['partitionKey'] = 'jay'

    docs = client.ReadDocuments(collection_link)
    print(list(docs))
except errors.DocumentDBError as e:
    if e.status_code == 404:
        print('A Database with id \'{0}\' does not exist'.format(id))
    else:
        raise errors.HTTPFailure(e.status_code)
If you want to query a partition of your collection, please add the snippet of code that is commented out in the code above.
options = {}
options['enableCrossPartitionQuery'] = True
options['partitionKey'] = 'jay'
It seems that your issue is focused on Azure Cosmos DB query performance.
You could refer to the following points to improve query performance.
Partitioning
You could set partition keys in your database and query with a filter clause on a single partition key, so that queries have lower latency and consume fewer RUs.
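For example, a hedged sketch with the old pydocumentdb client used above (the collection link, partition key path and value are assumptions, reusing the client from the earlier snippet):

query = {
    'query': 'SELECT * FROM c WHERE c.partitionKey = @pk',
    'parameters': [{'name': '@pk', 'value': 'jay'}],
}
options = {'partitionKey': 'jay', 'maxItemCount': 1000}

# stays within one partition, so it avoids a cross-partition fan-out
docs = client.QueryDocuments('dbs/db/colls/coll', query, options)
for doc in docs:
    print(doc['id'])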
Throughput
You could provision higher throughput so that Azure Cosmos DB performance per unit time is greatly improved. Of course, this will lead to higher costs.
Indexing Policy
The use of indexing paths can offer improved performance and lower latency.
For more details, I recommend that you refer to the official performance documentation.
Hope it helps you.
I'm a bit new to Python dev -- I'm creating a larger project for some web scraping. I want to approach this as "Pythonically" as possible, and would appreciate some help with the project structure. Here's how I'm doing it now:
Basically, I have a base class for an object whose purpose is to go to a website and parse some specific data on it into its own array, jobs[]
minion.py
class minion:
    # Empty getJobs() function to be defined by object pre-instantiation
    def getJobs(self):
        pass

    # Constructor for a minion that requires site authorization
    # Ex: minCity1 = minion('http://portal.com/somewhere', 'user', 'password')
    #  or minCity2 = minion('http://portal.com/somewhere')
    def __init__(self, title, URL, user='', password=''):
        self.title = title
        self.URL = URL
        self.user = user
        self.password = password
        self.jobs = []

        if (user == '' and password == ''):
            self.reqAuth = 0
        else:
            self.reqAuth = 1

    def displayjobs(self):
        for j in self.jobs:
            j.display()
I'm going to have about 100 different data sources. The way I'm doing it now is to just create a separate module for each "Minion", which defines (and binds) a more tailored getJobs() function for that object
Example: minCity1.py
from minion import minion
from BeautifulSoup import BeautifulSoup
import urllib2
from job import job

# MINION CONFIG
minTitle = 'Some city'
minURL = 'http://www.somewebpage.gov/'


# Here we define a function that will be bound to this object's getJobs function
def getJobs(self):
    page = urllib2.urlopen(self.URL)
    soup = BeautifulSoup(page)

    # For each row
    for tr in soup.findAll('tr'):
        tJob = job()
        span = tr.findAll(['span', 'class="content"'])

        # If the row has 5 spans, pull data from spans 2 and 3 ( [1] and [2] )
        if len(span) == 5:
            tJob.title = span[1].a.renderContents()
            tJob.client = 'Some City'
            tJob.source = minURL
            tJob.due = span[2].div.renderContents().replace('<br />', '')

            self.jobs.append(tJob)


# Don't forget to bind the function to the object!
minion.getJobs = getJobs

# Instantiate the object
mCity1 = minion(minTitle, minURL)
I also have a separate module which simply contains a list of all the instantiated minion objects (which I have to update each time I add one):
minions.py
from minion_City1 import mCity1
from minion_City2 import mCity2
from minion_City3 import mCity3
from minion_City4 import mCity4

minionList = [mCity1,
              mCity2,
              mCity3,
              mCity4]
main.py references minionList for all of its activities for manipulating the aggregated data.
This seems a bit chaotic to me, and I was hoping someone might be able to outline a more Pythonic approach.
Thank you, and sorry for the long post!
Instead of creating functions and assigning them to objects (or whatever minion is, I'm not really sure), you should definitely use classes instead. Then you'll have one class for each of your data sources.
If you want, you can even have these classes inherit from a common base class, but that isn't absolutely necessary.
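A rough sketch of that structure, adapting the names from your question (purely illustrative):

class Minion:
    """Base class: one subclass per data source."""

    def __init__(self, title, url, user='', password=''):
        self.title = title
        self.url = url
        self.user = user
        self.password = password
        self.jobs = []

    def get_jobs(self):
        raise NotImplementedError   # each source provides its own parsing


class SomeCityMinion(Minion):
    def get_jobs(self):
        # fetch self.url, parse the rows and append job objects to self.jobs here
        pass


# the registry becomes a plain list of instances
minion_list = [SomeCityMinion('Some city', 'http://www.somewebpage.gov/')]
for m in minion_list:
    m.get_jobs()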
I've got a Python CGI script that pulls data from a GPS service; I'd like this information to be updated on the webpage about once every 10s (the max allowed by the GPS service's TOS). But there could be, say, 100 users viewing the webpage at once, all calling the script.
I think the users' scripts need to grab data from a buffer page that itself only upates once every ten seconds. How can I make this buffer page auto-update if there's no one directly viewing the content (and not accessing the CGI)? Are there better ways to accomplish this?
Cache the results of your GPS data query in a file or database (sqlite) along with a datetime.
You can then do a datetime check against the last cached datetime to initiate another GPS data query.
You'll probably run into concurrency issues with cgi and the datetime check though...
To get around concurrency issues, you can use sqlite, and put the write in a try/except.
Here's a sample cache implementation using sqlite.
import datetime
import sqlite3


class GpsCache(object):
    db_path = 'gps_cache.db'

    def __init__(self):
        self.con = sqlite3.connect(self.db_path)
        self.cur = self.con.cursor()

    def _get_period(self, dt=None):
        '''normalize time to 15 minute periods'''
        if dt.minute < 15:
            minute_period = 0
        elif 15 <= dt.minute < 30:
            minute_period = 15
        elif 30 <= dt.minute < 45:
            minute_period = 30
        elif 45 <= dt.minute:
            minute_period = 45
        period_dt = datetime.datetime(year=dt.year, month=dt.month, day=dt.day, hour=dt.hour, minute=minute_period)
        return period_dt

    def get_cache(self, dt=None):
        period_dt = self._get_period(dt)
        select_sql = 'SELECT * FROM GPS_CACHE WHERE date_time = "%s";' % period_dt.strftime('%Y-%m-%d %H:%M')
        self.cur.execute(select_sql)
        row = self.cur.fetchone()
        return row[0] if row else None

    def put_cache(self, dt=None, data=None):
        period_dt = self._get_period(dt)
        insert_sql = 'INSERT ....'  # edit to your table structure
        try:
            self.cur.execute(insert_sql)
            self.con.commit()
        except sqlite3.OperationalError:
            # assume the db is being updated by another process with the current results and ignore
            pass
So we have the cache tool; now for the implementation side.
You'll want to check the cache first, and if it isn't 'fresh' (doesn't return anything), go grab the data using your current method. Then cache the data you grabbed.
You should probably organize this better, but you should get the general idea here.
Using this sample, you just replace your current calls to 'remote_get_gps_data' with 'get_gps_data'.
import datetime

from gps_cacher import GpsCache


def remote_get_gps_data():
    # your function here
    return data


def get_gps_data():
    data = None
    gps_cache = GpsCache()

    current_dt = datetime.datetime.now()

    cached_data = gps_cache.get_cache(current_dt)
    if cached_data:
        data = cached_data
    else:
        data = remote_get_gps_data()
        gps_cache.put_cache(current_dt, data)

    return data