Every 4 seconds, I have to store 32,000 rows of data. Each row consists of one timestamp value and 464 double precision values. The column name for the timestamp is time, and the column names for the double precision values increase sequentially as channel1, channel2, ..., channel464.
I establish a connection as follows:
CONNECTION = f"postgres://{username}:{password}#{host}:{port}/{dbname}"#?sslmode=require"
self.TimescaleDB_Client = psycopg2.connect(CONNECTION)
I then verify the TimescaleDB extension with the following:
def verifyTimeScaleInstall(self):
    try:
        sql_query = "CREATE EXTENSION IF NOT EXISTS timescaledb CASCADE;"
        cur = self.TimescaleDB_Client.cursor()
        cur.execute(sql_query)
        cur.close()
        self.TimescaleDB_Client.commit()
    except:
        self.timescaleLogger.error("An error occurred in verifyTimeScaleInstall")
        tb = traceback.format_exc()
        self.timescaleLogger.exception(tb)
        return False
I then create a hypertable for my data with the following:
def createRAWDataTable(self):
    try:
        cur = self.TimescaleDB_Client.cursor()
        self.query_create_raw_data_table = None
        for channel in range(self.num_channel):
            channel = channel + 1
            if self.query_create_raw_data_table is None:
                self.query_create_raw_data_table = f"CREATE TABLE IF NOT EXISTS raw_data (time TIMESTAMPTZ NOT NULL, channel{channel} REAL"
            else:
                self.query_create_raw_data_table = self.query_create_raw_data_table + f", channel{channel} REAL"
        self.query_create_raw_data_table = self.query_create_raw_data_table + ");"
        self.query_create_raw_data_hypertable = "SELECT create_hypertable('raw_data', 'time');"
        cur.execute(self.query_create_raw_data_table)
        cur.execute(self.query_create_raw_data_hypertable)
        self.TimescaleDB_Client.commit()
        cur.close()
    except:
        self.timescaleLogger.error("An error occurred in createRAWDataTable")
        tb = traceback.format_exc()
        self.timescaleLogger.exception(tb)
        return False
I then insert the data into the hypertable using the following:
def insertRAWData(self, seconds):
    try:
        insert_start_time = datetime.now(pytz.timezone("MST"))
        current_time = insert_start_time
        num_iterations = seconds * self.fs
        time_increment = timedelta(seconds=1/self.fs)
        raw_data_query = self.query_insert_raw_data
        dtype = "float32"
        matrix = np.random.rand(self.fs*seconds, self.num_channel).astype(dtype)
        cur = self.TimescaleDB_Client.cursor()
        data = list()
        for iteration in range(num_iterations):
            raw_data_row = matrix[iteration, :].tolist()  # Select a particular row and all columns
            time_string = current_time.strftime("%Y-%m-%d %H:%M:%S.%f %Z")
            raw_data_values = (time_string,) + tuple(raw_data_row)
            data.append(raw_data_values)
            current_time = current_time + time_increment
        start_time = time.perf_counter()
        psycopg2.extras.execute_values(
            cur, raw_data_query, data, template=None, page_size=100
        )
        print(time.perf_counter() - start_time)
        self.TimescaleDB_Client.commit()
        cur.close()
    except:
        self.timescaleLogger.error("An error occurred in insertRAWData")
        tb = traceback.format_exc()
        self.timescaleLogger.exception(tb)
        return False
The SQL Query String that I am referencing in the above code is obtained from the following:
def getRAWData_Query(self):
    try:
        self.query_insert_raw_data = None
        for channel in range(self.num_channel):
            channel = channel + 1
            if self.query_insert_raw_data is None:
                self.query_insert_raw_data = f"INSERT INTO raw_data (time, channel{channel}"
            else:
                self.query_insert_raw_data = self.query_insert_raw_data + f", channel{channel}"
        self.query_insert_raw_data = self.query_insert_raw_data + ") VALUES %s;"
        return self.query_insert_raw_data
    except:
        self.timescaleLogger.error("An error occurred in insertRAWData_Query")
        tb = traceback.format_exc()
        self.timescaleLogger.exception(tb)
        return False
As you can see, I am using psycopg2.extras.execute_values() to insert the values. To my understanding, this is one of the fastest ways to insert data. However, it takes about 80 seconds to insert this data, on quite a beefy system with 12 cores/24 threads, SSDs, and 256 GB of RAM. Can this be done faster? It just seems quite slow.
I would like to use TimescaleDB and am evaluating its performance, but the write needs to complete within about 2 seconds for it to be acceptable.
Edit: I have tried to use pandas to perform the insert, but it took longer, at about 117 seconds. The following is the function that I used.
def insertRAWData_Pandas(self, seconds):
    try:
        insert_start_time = datetime.now(pytz.timezone("MST"))
        current_time = insert_start_time
        num_iterations = seconds * self.fs
        time_increment = timedelta(seconds=1/self.fs)
        raw_data_query = self.query_insert_raw_data
        dtype = "float32"
        matrix = np.random.rand(self.fs*seconds, self.num_channel).astype(dtype)
        pd_df_dict = {}
        pd_df_dict["time"] = list()
        for iteration in range(num_iterations):
            time_string = current_time.strftime("%Y-%m-%d %H:%M:%S.%f %Z")
            pd_df_dict["time"].append(time_string)
            current_time = current_time + time_increment
        for channel in range(self.num_channel):
            pd_df_dict[f"channel{channel}"] = matrix[:, channel].tolist()
        start_time = time.perf_counter()
        pd_df = pd.DataFrame(pd_df_dict)
        pd_df.to_sql('raw_data', self.engine, if_exists='append')
        print(time.perf_counter() - start_time)
    except:
        self.timescaleLogger.error("An error occurred in insertRAWData_Pandas")
        tb = traceback.format_exc()
        self.timescaleLogger.exception(tb)
        return False
Edit: I have tried to use CopyManager, and it appears to be producing the best results, at around 74 seconds. Still not what I was after, however.
def insertRAWData_PGCOPY(self, seconds):
    try:
        insert_start_time = datetime.now(pytz.timezone("MST"))
        current_time = insert_start_time
        num_iterations = seconds * self.fs
        time_increment = timedelta(seconds=1/self.fs)
        dtype = "float32"
        matrix = np.random.rand(num_iterations, self.num_channel).astype(dtype)
        data = list()
        for iteration in range(num_iterations):
            raw_data_row = matrix[iteration, :].tolist()  # Select a particular row and all columns
            #time_string = current_time.strftime("%Y-%m-%d %H:%M:%S.%f %Z")
            raw_data_values = (current_time,) + tuple(raw_data_row)
            data.append(raw_data_values)
            current_time = current_time + time_increment
        channelList = list()
        for channel in range(self.num_channel):
            channel = channel + 1
            channelString = f"channel{channel}"
            channelList.append(channelString)
        channelList.insert(0, "time")
        cols = tuple(channelList)
        start_time = time.perf_counter()
        mgr = CopyManager(self.TimescaleDB_Client, 'raw_data', cols)
        mgr.copy(data)
        self.TimescaleDB_Client.commit()
        print(time.perf_counter() - start_time)
    except:
        self.timescaleLogger.error("An error occurred in insertRAWData_PGCOPY")
        tb = traceback.format_exc()
        self.timescaleLogger.exception(tb)
        return False
I tried to modify the following values in postgresql.conf. There wasn't a noticeable performance improvement.
wal_level = minimal
fsync = off
synchronous_commit = off
wal_writer_delay = 2000ms
commit_delay = 100000
I have tried to modify the chunk size according to one of the comments below, using the following in my createRAWDataTable() function. However, there wasn't an improvement in the insert times. Perhaps this was also to be expected, given that I haven't been accumulating data; the database has only held a few samples, perhaps at most a minute's worth, over the course of my testing.
self.query_create_raw_data_hypertable = "SELECT create_hypertable('raw_data', 'time', chunk_time_interval => INTERVAL '3 day',if_not_exists => TRUE);"
Edit: For anyone reading this, I was able to pickle and insert a 32000x464 float32 numpy matrix in about 0.5 seconds with MongoDB, which is what my final solution is. Perhaps MongoDB just does better with this workload in this case.
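For illustration, a hedged sketch of one way such an insert can be done with pymongo (not necessarily how the final solution was structured); since a pickled 32000x464 float32 matrix is roughly 59 MB, above MongoDB's 16 MB document limit, this sketch stores the blob through GridFS, and all names are placeholders.
import pickle
from datetime import datetime, timezone

import gridfs
import numpy as np
from pymongo import MongoClient

client = MongoClient("mongodb://localhost:27017/")
db = client["raw_data_db"]  # placeholder database name
fs = gridfs.GridFS(db)

# one blob per 4-second block: the pickled 32000x464 matrix, tagged with its timestamp
matrix = np.random.rand(32000, 464).astype("float32")
blob = pickle.dumps(matrix, protocol=pickle.HIGHEST_PROTOCOL)
fs.put(blob, timestamp=datetime.now(timezone.utc))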
I have two initial suggestions that may help with overall performance.
The default hypertable you are creating will "chunk" your data by 7-day periods (this means each chunk will hold around 4,838,400,000 rows of data given your parameters). Since your data is so granular, you may want to use a different chunk size. Check out the docs here for info on the optional chunk_time_interval argument. Changing the chunk size should help with insert and query speed, and it will also give you better performance in compression if needed later on.
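For illustration, a hedged sketch of adjusting the chunk interval: create_hypertable accepts chunk_time_interval at creation time, and set_chunk_time_interval changes it for an existing hypertable (affecting chunks created from then on). The connection string and the 1-day interval are placeholders only.
import psycopg2

conn = psycopg2.connect("postgres://username:password@host:5432/dbname")
with conn.cursor() as cur:
    # at creation time
    cur.execute("SELECT create_hypertable('raw_data', 'time', "
                "chunk_time_interval => INTERVAL '1 day', if_not_exists => TRUE);")
    # or later, for chunks created from now on
    cur.execute("SELECT set_chunk_time_interval('raw_data', INTERVAL '1 day');")
conn.commit()
conn.close()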
As the individuals above stated, playing around with batch inserts should also help. If you haven't checked out this stock data tutorial, I would highly recommend it. Using pgcopy and its CopyManager could help with inserting df objects more quickly.
Hopefully, some of this information can be helpful to your situation!
disclosure: I am part of the Timescale team 😊
You can use the SQLAlchemy library to do it, and also calibrate the chunksize while you are at it.
Appending the data should take possibly less than 74 seconds, since I perform a similar kind of insertion and it takes me about 40-odd seconds.
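A minimal sketch of that approach, assuming an SQLAlchemy engine pointed at the same database; the connection URL, the small example frame, and the chunksize value are illustrative and worth calibrating against the real 32,000-row batches.
import numpy as np
import pandas as pd
from sqlalchemy import create_engine

engine = create_engine("postgresql+psycopg2://username:password@host:5432/dbname")

# a small stand-in for one batch of rows shaped like the raw_data table
df = pd.DataFrame(np.random.rand(1000, 4).astype("float32"),
                  columns=[f"channel{i + 1}" for i in range(4)])
df.insert(0, "time", pd.Timestamp.now(tz="UTC"))

# chunksize controls how many rows are written per batch; calibrate it for the workload
df.to_sql("raw_data", engine, if_exists="append", index=False, chunksize=10_000, method="multi")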
Another possibility is to use pandas.DataFrame.to_sql with a callable passed to its method argument. It can increase the performance drastically: in comparison to plain to_sql (150 s) or to_sql with method='multi' (196 s), the callable method did the job in just 14 s.
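A sketch of such a callable, adapted from the COPY-based insertion method described in the pandas documentation; it is assumed here that the target is PostgreSQL/TimescaleDB reached through a psycopg2-backed SQLAlchemy engine.
import csv
from io import StringIO

def psql_insert_copy(table, conn, keys, data_iter):
    # get a raw DBAPI connection from the SQLAlchemy connection wrapper
    dbapi_conn = conn.connection
    with dbapi_conn.cursor() as cur:
        buf = StringIO()
        writer = csv.writer(buf)
        writer.writerows(data_iter)
        buf.seek(0)
        columns = ', '.join('"{}"'.format(k) for k in keys)
        table_name = '{}.{}'.format(table.schema, table.name) if table.schema else table.name
        sql = 'COPY {} ({}) FROM STDIN WITH CSV'.format(table_name, columns)
        cur.copy_expert(sql=sql, file=buf)

# usage, assuming `df` and `engine` as in the question:
# df.to_sql('raw_data', engine, if_exists='append', index=False, method=psql_insert_copy)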
One of the fastest ways is to:
first create a pandas DataFrame of the data that you want to insert into the DB,
then use that DataFrame to bulk-insert the data into the DB.
Here is a way you can do it: How to write data frame to postgres?
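A hedged sketch of that idea against the raw_data table from the question, using psycopg2's copy_from to stream the DataFrame through an in-memory CSV buffer instead of row-by-row INSERTs; the connection string is a placeholder.
from io import StringIO

import numpy as np
import pandas as pd
import psycopg2

conn = psycopg2.connect("postgres://username:password@host:5432/dbname")

num_channel = 464
columns = ["time"] + [f"channel{i + 1}" for i in range(num_channel)]

df = pd.DataFrame(np.random.rand(32000, num_channel).astype("float32"), columns=columns[1:])
df.insert(0, "time", pd.Timestamp.now(tz="UTC"))

# serialize the frame to CSV in memory and hand it to COPY
buf = StringIO()
df.to_csv(buf, index=False, header=False)
buf.seek(0)

with conn.cursor() as cur:
    cur.copy_from(buf, "raw_data", sep=",", columns=columns)
conn.commit()
conn.close()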
Related
I have been trying to make my code faster by running parallel processes, with no luck. I am fetching weather data with an external library (https://github.com/pnuu/fmiopendata). Under the hood, the library simply uses requests.get() to fetch data from the API. Any tips on how to proceed? I could surely edit the code of fmiopendata, but I would prefer a workaround rather than having to refactor others' code.
Here is some working code, which I would like to edit:
from fmiopendata.wfs import download_stored_query
def parseStartTime(ts, year):
    return str(year) + "-" + ts[0][0] + "-" + ts[0][1] + "T00:00:00Z"

def parseEndTime(ts, year):
    return str(year) + "-" + ts[1][0] + "-" + ts[1][1] + "T23:59:59Z"

def weatherWFS(lat, lon, start_time, end_time):
    # Downloading the observations from the WFS server. Using bbox and timestamps for querying
    while True:
        try:
            obs = download_stored_query(
                "fmi::observations::weather::daily::multipointcoverage",
                args=["bbox=" + str(lon - 1e-2) + "," + str(lat - 1e-2) + "," + str(lon + 1e-2) + "," + str(lat + 1e-2),
                      "starttime=" + start_time,
                      "endtime=" + end_time])
            if obs.data == {}:
                return False
            else:
                return obs
        except:
            pass
def getWeatherData(lat, lon):
    StartYear, EndYear = 2011, 2021
    # Handling the data in suitable chunks. Array pairs represent the starting and ending
    # dates of the intervals in ["MM", "dd"] format
    intervals = [
        [["01", "01"], ["03", "31"]],
        [["04", "01"], ["06", "30"]],
        [["07", "01"], ["09", "30"]],
        [["10", "01"], ["12", "31"]]
    ]
    # Start and end timestamps are saved in an array
    queries = [[parseStartTime(intervals[i], year),
                parseEndTime(intervals[i], year)]
               for year in range(StartYear, EndYear + 1)
               for i in range(len(intervals))]
    for query in queries:
        # This is the request we need to run in parallel processing to save time
        # the obs-objects need to be saved somehow and merged afterwards
        obs = weatherWFS(lat, lon, query[0], query[1])
        """ INSERT MAGIC CODE HERE """

lat, lon = 62.6, 29.72
getWeatherData(lat, lon)
Answering my own question:
The best solution I found so far is to use concurrent.futures with either the map() or submit() functions.
The solution suggested by Trambi does not improve the execution, as the requests are not CPU intensive. The bottleneck here is the waiting time, during which the CPU has to stay idle, and therefore using separate processes is not going to solve the problem. However, multithreading can improve the speed, as the threads are created and shut down more quickly.
Using ThreadPoolExecutor in combination with as_completed(), I was able to reduce the execution time by ~15%.
from concurrent.futures import ThreadPoolExecutor, as_completed
from fmiopendata.wfs import download_stored_query

def parseStartTime(ts, year):
    return str(year) + "-" + ts[0][0] + "-" + ts[0][1] + "T00:00:00Z"

def parseEndTime(ts, year):
    return str(year) + "-" + ts[1][0] + "-" + ts[1][1] + "T23:59:59Z"

def weatherWFS(lat, lon, start_time, end_time):
    # Downloading the observations from the WFS server. Using bbox and timestamps for querying
    while True:
        try:
            obs = download_stored_query(
                "fmi::observations::weather::daily::multipointcoverage",
                args=["bbox=" + str(lon - 1e-2) + "," + str(lat - 1e-2) + "," + str(lon + 1e-2) + "," + str(lat + 1e-2),
                      "starttime=" + start_time,
                      "endtime=" + end_time])
            if obs.data == {}:
                return False
            else:
                return obs
        except:
            pass

def getWeatherData(lat, lon):
    StartYear, EndYear = 2011, 2021
    # Handling the data in suitable chunks. Array pairs represent the starting and ending
    # dates of the intervals in ["MM", "dd"] format
    intervals = [
        [["01", "01"], ["03", "31"]],
        [["04", "01"], ["06", "30"]],
        [["07", "01"], ["09", "30"]],
        [["10", "01"], ["12", "31"]]
    ]
    # Start and end timestamps are saved in an array
    queries = [
        [lat, lon,
         parseStartTime(intervals[i], year),
         parseEndTime(intervals[i], year)]
        for year in range(StartYear, EndYear)
        for i in range(len(intervals))]
    with ThreadPoolExecutor() as executor:
        observations = [executor.submit(weatherWFS, *query) for query in queries]
        for obs in as_completed(observations):
            obs = obs.result()
            """do stuff with the observations"""

lat, lon = 62.6, 29.72
getWeatherData(lat, lon)
You could try using multiprocessing.Pool.
Replace your for query in queries: loop with something like:
import multiprocessing
iterable = zip([lat]*len(queries), [lon]*len(queries), queries)
pool = multiprocessing.Pool(len(queries))
obs_list = pool.starmap(func=weatherWFS, iterable=iterable)
pool.close()
pool.join()
Note that this will pass lat, lon and the whole query element as arguments to weatherWFS, so you should change the function signature accordingly:
def weatherWFS(lat, lon, query):
    start_time = query[0]
    end_time = query[1]
Depending on the length of queries and its elements, you might also choose to unpack queries in your iterable...
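A minimal sketch of that unpacking variant, assuming the original four-argument weatherWFS(lat, lon, start_time, end_time) and the queries list of [start_time, end_time] pairs from the question; the pool size is illustrative.
import multiprocessing

if __name__ == "__main__":
    args = [(lat, lon, start_time, end_time) for start_time, end_time in queries]
    with multiprocessing.Pool(processes=8) as pool:
        obs_list = pool.starmap(weatherWFS, args)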
Given a list of data to process and a 64-core CPU (plus 500 GB RAM).
The list should sort strings and store data in a result set of millions of records, which runs just fine and takes a few seconds with multiprocessing.
But I also need to store the result somehow, either in a txt/csv output or in a database. So far I haven't found a viable solution, because after the first part (process), the insert method either raises an error when I try it with MySQL pooling, or takes an insanely long time when writing the txt output.
What I've tried so far: simple txt output, printing to a txt file, using the csv, pandas and numpy libraries. Nothing seems to speed it up. Any help would be greatly appreciated!
My code right now:
import os
import re
import sys
import datetime
import time
import csv
import mysql.connector as connector
from mysql.connector.pooling import MySQLConnectionPool
import mysql
import numpy as np
from tqdm import tqdm
from time import sleep
import multiprocessing as mp
import numpy
pool = MySQLConnectionPool(pool_name="sql_pool",
                           pool_size=32,
                           pool_reset_session=True,
                           host="localhost",
                           port="3306",
                           user="homestead",
                           password="secret",
                           database="homestead")
# # sql connection
db = mysql.connector.connect(
    host="localhost",
    port="3306",
    user="homestead",
    password="secret",
    database="homestead"
)
sql_cursor = db.cursor()
delete_statement = "DELETE FROM statistics"
sql_cursor.execute(delete_statement)
db.commit()
sql_statement = "INSERT INTO statistics (name, cnt) VALUES (%s, %s)"
list = []
domains = mp.Manager().list()
unique_list = mp.Manager().list()
invalid_emails = mp.Manager().list()
result = mp.Manager().list()
regex_email = '^(\w|\.|\_|\-)+[@](\w|\_|\-|\.)+[.]\w{2,3}$'
# check email validity
def check(list, email):
    if(re.search(regex_email, email)):
        domains.append(email.lower().split('@')[1])
        return True
    else:
        invalid_emails.append(email)
        return False
#end of check email validity
# execution time converter
def convertTime(seconds):
    seconds = seconds % (24 * 3600)
    hour = seconds // 3600
    seconds %= 3600
    minutes = seconds // 60
    seconds %= 60
    if(hour == 0):
        if(minutes == 0):
            return "{0} sec".format(seconds)
        else:
            return "{0}min {1}sec".format(minutes, seconds)
    else:
        return "{0}hr {1}min {2}sec".format(hour, minutes, seconds)
# execution time converter end
#process
def process(list):
    for item in tqdm(list):
        if(check(list, item)):
            item = item.lower().split('@')[1]
            if item not in unique_list:
                unique_list.append(item)
# end of process
def insert(list):
    global sql_statement
    # Add to db
    con = pool.get_connection()
    cur = con.cursor()
    print("PID %d: using connection %s" % (os.getpid(), con))
    #cur.executemany(sql_statement, sorted(map(set_result, list)))
    for item in list:
        cur.execute(sql_statement, (item, domains.count(item)))
    con.commit()
    cur.close()
    con.close()
# def insert_into_database(list):
#sql_cursor.execute(sql_statement, (unique_list, 1), multi=True)
# sql_cursor.executemany(sql_statement, sorted(map(set_result, list)))
# db.commit()
# statistics
def statistics(list):
    for item in tqdm(list):
        if(domains.count(item) > 0):
            result.append([domains.count(item), item])
# end of statistics
params = sys.argv
filename = ''
process_count = -1
for i, item in enumerate(params):
    if(item.endswith('.txt')):
        filename = item
    if(item == '--top'):
        process_count = int(params[i+1])
def set_result(item):
    return item, domains.count(item)
# main
if(filename):
    try:
        start_time = time.time()
        now = datetime.datetime.now()
        dirname = "email_stats_{0}".format(now.strftime("%Y%m%d_%H%M%S"))
        os.mkdir(dirname)
        list = open(filename).read().split()
        if(process_count == -1):
            process_count = len(list)
        if(process_count > 0):
            list = list[:process_count]
        #chunking list
        n = int(len(list) / mp.cpu_count())
        chunks = [list[i:i + n] for i in range(0, len(list), n)]
        processes = []
        print('Processing list on {0} cores...'.format(mp.cpu_count()))
        for chunk in chunks:
            p = mp.Process(target=process, args=[chunk])
            p.start()
            processes.append(p)
        for p in processes:
            p.join()
        # insert(unique_list)
        ## step 2 - write sql
        ## Clearing out db before new data insert
        con = pool.get_connection()
        cur = con.cursor()
        delete_statement = "DELETE FROM statistics"
        cur.execute(delete_statement)
        u_processes = []
        # Maximum pool size for sql is 32, so maximum chunk number should be that too.
        if(mp.cpu_count() < 32):
            n2 = int(len(unique_list) / mp.cpu_count())
        else:
            n2 = int(len(unique_list) / 32)
        u_chunks = [unique_list[i:i + n2] for i in range(0, len(unique_list), n2)]
        for u_chunk in u_chunks:
            p = mp.Process(target=insert, args=[u_chunk])
            p.start()
            u_processes.append(p)
        for p in u_processes:
            p.join()
        for p in u_processes:
            p.close()
        # sql_cursor.executemany(sql_statement, sorted(map(set_result, unique_list)))
        # db.commit()
        # for item in tqdm(unique_list):
        #     sql_val = (item, domains.count(item))
        #     sql_cursor.execute(sql_statement, sql_val)
        #     db.commit()
        ## numpy.savetxt('saved.txt', sorted(map(set_result, unique_list)), fmt='%s')
        # with(mp.Pool(mp.cpu_count(), initializer = db) as Pool:
        #     Pool.map_async(insert_into_database(),set(unique_list))
        #     Pool.close()
        #     Pool.join()
        print('Creating statistics for {0} individual domains...'.format(len(unique_list)))
        # unique_list = set(unique_list)
        # with open("{0}/result.txt".format(dirname), "w+") as f:
        #     csv.writer(f).writerows(sorted(map(set_result, unique_list), reverse=True))
        print('Writing final statistics...')
        print('OK.')
        f = open("{0}/stat.txt".format(dirname), "w+")
        f.write("Number of processed emails: {0}\r\n".format(process_count))
        f.write("Number of valid emails: {0}\r\n".format(len(list) - len(invalid_emails)))
        f.write("Number of invalid emails: {0}\r\n".format(len(invalid_emails)))
        f.write("Execution time: {0}".format(convertTime(int(time.time() - start_time))))
        f.close()
    except FileNotFoundError:
        print('File not found, path or file broken.')
else:
    print('Wrong file format, should be a txt file.')
# main
See my comments regarding some changes you might wish to make, one of which might improve performance. But I think one area where performance could really be improved is your use of managed lists. These are represented by proxies, and each operation on such a list is essentially a remote procedure call and thus very slow. You cannot avoid this, given that you need multiple processes updating common, shared lists (or a dict if you take my suggestion). But in the main process you might be trying, for example, to construct a set from a shared list as follows:
Pool.map_async(insert_into_database(),set(unique_list))
(by the way, that should be Pool.map(insert_into_database, set(unique_list)), i.e. you have an extra set of () and you can then get rid of the calls to pool.close() and pool.join() if you wish)
The problem is that you are iterating over every element of unique_list through a proxy, which might be what is taking a very long time. I say "might" because I would think the use of managed lists would prevent the code as is, i.e. without outputting the results, from completing in "a few seconds" if we are talking about "millions" of records and thus millions of remote procedure calls. But this number could certainly be reduced if you could somehow get the underlying list as a native list.
First, you need to heed my comment about having declared a variable named list, thus making it impossible to create native lists or subclasses of list. Once you have renamed that variable to something more reasonable, we can create our own managed class MyList that will expose the underlying list on which it is built. Note that you can do the same thing with a MyDict class that subclasses dict. I have defined both classes for you. Here is a benchmark showing the difference between constructing a native list from a managed list versus creating a native list from a MyList:
import multiprocessing as mp
from multiprocessing.managers import BaseManager
import time

class MyManager(BaseManager):
    pass

class MyList(list):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def get_underlying_list(self):
        return self

class MyDict(dict):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def get_underlying_dict(self):
        return self

# required for windows, which I am running on:
if __name__ == '__main__':
    l = mp.Manager().list()
    for i in range(100_000):
        l.append(i)
    t = time.time()
    l2 = list(l)
    print(time.time() - t, l2[0:5], l2[-5:])

    MyManager.register('MyList', MyList)
    MyManager.register('MyDict', MyDict)
    my_manager = MyManager()
    # must explicitly start the manager or use: with MyManager() as manager:
    my_manager.start()
    l = my_manager.MyList()
    for i in range(100_000):
        l.append(i)
    t = time.time()
    l2 = list(l.get_underlying_list())
    print(time.time() - t, l2[0:5], l2[-5:])
Prints:
7.3949973583221436 [0, 1, 2, 3, 4] [99995, 99996, 99997, 99998, 99999]
0.007997751235961914 [0, 1, 2, 3, 4] [99995, 99996, 99997, 99998, 99999]
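For completeness, a hedged sketch of how the registered MyDict could be used in the same way, assuming the tally-by-domain use case from the question; only public dict methods (get, update, items, ...) are exposed through the auto-generated proxy, which is why the updates below go through update() rather than item assignment.
if __name__ == '__main__':
    MyManager.register('MyDict', MyDict)
    with MyManager() as manager:
        domain_counts = manager.MyDict()
        for domain in ['example.com', 'example.com', 'test.org']:
            count = domain_counts.get(domain, 0)
            domain_counts.update({domain: count + 1})
        # pull the whole dict back once instead of issuing per-element proxy calls
        native_counts = dict(domain_counts.get_underlying_dict())
        print(native_counts)  # {'example.com': 2, 'test.org': 1}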
I was performing a NoSQL performance benchmark for a client, and I was wondering whether my Aerospike Python code is optimal. I'm trying to record query time and load time. The data has 500,000 rows and 8 columns. My code is below.
def test_db():
    config = {
        'hosts': [ ('127.0.0.1', 3000) ]
    }
    client = aerospike.client(config).connect()
    t0 = time.time()
    global rec
    rec = {}
    with open('skunkworks.csv', 'r') as f:
        reader = csv.reader(f)
        rownum = 0
        for row in reader:
            # Save First Row with headers
            if rownum == 0:
                header = row
            else:
                colnum = 0
                for col in row:
                    rec[header[colnum]] = col
                    colnum += 1
            rownum += 1
            if rec:
                client.put(('test', 'demo', str(rownum)), rec)
                rec = {}
    t1 = time.time()
    load_time = t1 - t0
    t2 = time.time()
    for i in range(2, 500002):
        (key, metadata, record) = client.get(('test', 'demo', str(i)))
        # print(record)
    t3 = time.time()
    read_time = t3 - t2
    return [load_time, read_time]
Is your Python application going to run as a single process, or will it be a multi-process approach, such as fastCGI?
If you're trying to benchmark, make sure it's simulating how your application will run. To write lots of rows, then read lots of rows, all from a single process, doesn't usually simulate anything realistic. Perhaps in your case it does, but if not, make your sample code match the real access pattern.
Also, you should deploy your benchmark in a similar way to the application. Don't run benchmarks on the same machine as the server nodes, if that's not how it'll be in production.
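As an illustration of simulating a multi-process access pattern, here is a hedged sketch that splits the read phase across several processes; it reuses the client configuration from the question, and the process count and key ranges are assumptions for the example.
import time
import multiprocessing as mp

import aerospike

def read_range(start, stop):
    client = aerospike.client({'hosts': [('127.0.0.1', 3000)]}).connect()
    for i in range(start, stop):
        client.get(('test', 'demo', str(i)))
    client.close()

if __name__ == '__main__':
    t0 = time.time()
    # split keys 2..500001 into four contiguous ranges, one per process
    ranges = [(2 + n * 125000, 2 + (n + 1) * 125000) for n in range(4)]
    procs = [mp.Process(target=read_range, args=r) for r in ranges]
    for p in procs:
        p.start()
    for p in procs:
        p.join()
    print('read time with 4 processes:', time.time() - t0)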
I am reading a bunch of daily files and using glob to concatenate them all together into separate dataframes. I eventually join them together and basically create a single large file which I use to connect to a dashboard. I am not too familiar with Python, but I use pandas and sklearn often.
As you can see, I am basically just reading the last 60 (or more) days worth of data (the last 60 files) and creating a dataframe for each. This works, but I am wondering if there is a more Pythonic/better way? I watched a video on PyData (about not being restricted by PEP 8 and making sure your code is Pythonic), which was interesting.
(FYI - the reason why I need to read 60 days' worth of data is that customers can fill out a survey about a call which happened a long time ago. The customer fills out a survey today about a call that happened in July. I need to know about that call: how long it lasted, what the topic was, etc.)
os.chdir(r'C:\\Users\Documents\FTP\\')
loc = r'C:\\Users\Documents\\'
rosterloc = r'\\mand\\'
splitsname = r'Splits.csv'
fcrname = r'global_disp_'
npsname = r'survey_'
ahtname = r'callbycall_'
rostername = 'Daily_Roster.csv'
vasname = r'vas_report_'
ext ='.csv'
startdate = dt.date.today() - Timedelta('60 day')
enddate = dt.date.today()
daterange = Timestamp(enddate) - Timestamp(startdate)
daterange = (daterange / np.timedelta64(1, 'D')).astype(int)
data = []
frames = []
calls = []
bracket = []
try:
    for date_range in (Timestamp(startdate) + dt.timedelta(n) for n in range(daterange)):
        aht = pd.read_csv(ahtname + date_range.strftime('%Y_%m_%d') + ext)
        calls.append(aht)
except IOError:
    print('File does not exist:', ahtname + date_range.strftime('%Y_%m_%d') + ext)
aht = pd.concat(calls)
print('AHT Done')
try:
    for date_range in (Timestamp(startdate) + dt.timedelta(n) for n in range(daterange)):
        fcr = pd.read_csv(fcrname + date_range.strftime('%m_%d_%Y') + ext, parse_dates=['call_time'])
        data.append(fcr)
except IOError:
    print('File does not exist:', fcrname + date_range.strftime('%m_%d_%Y') + ext)
fcr = pd.concat(data)
print('FCR Done')
try:
    for date_range in (Timestamp(enddate) - dt.timedelta(n) for n in range(3)):
        nps = pd.read_csv(npsname + date_range.strftime('%m_%d_%Y') + ext, parse_dates=['call_date', 'date_completed'])
        frames.append(nps)
except IOError:
    print('File does not exist:', npsname + date_range.strftime('%m_%d_%Y') + ext)
nps = pd.concat(frames)
print('NPS Done')
try:
    for date_range in (Timestamp(startdate) + dt.timedelta(n) for n in range(daterange)):
        vas = pd.read_csv(vasname + date_range.strftime('%m_%d_%Y') + ext, parse_dates=['Call_date'])
        bracket.append(vas)
except IOError:
    print('File does not exist:', vasname + date_range.strftime('%m_%d_%Y') + ext)
vas = pd.concat(bracket)
print('VAS Done')
roster = pd.read_csv(loc + rostername)
print('Roster Done')
splits = pd.read_csv(loc + splitsname)
print('Splits Done')
I didn't change the names, but IMHO they should be more verbose, e.g. pd == pandas? Not sure. Here is a more Pythonic way to write it:
from functools import partial
import logging
from operator import add, sub
import os
import datetime as dt
import contextlib
os.chdir(r'C:\\Users\Documents\FTP\\')
location = r'C:\\Users\Documents\\'
roster_location = r'\\mand\\'
splits_name = r'Splits.csv'
fcr_name = r'global_disp_'
nps_name = r'survey_'
aht_name = r'callbycall_'
roster_name = 'Daily_Roster.csv'
vas_name = r'vas_report_'
ext = '.csv'
start_date = dt.date.today() - Timedelta('60 day')
end_date = dt.date.today()
daterange = Timestamp(end_date) - Timestamp(start_date)
daterange = (daterange / np.timedelta64(1, 'D')).astype(int)
logger = logging.getLogger() # logger is better than "print" in case, when you have multiple tiers to log. In this case: regular debug and exceptions
def timestamps_in_range(daterange, method=add):  # injected operation method instead of "if" statement in case of subtracting
    for n in xrange(daterange):
        yield method(Timestamp(start_date), dt.timedelta(n))  # use generators for creating series of data in place

def read_csv(name, date_range, **kwargs):  # use functions/methods to shorten (make more readable) long, repetitive method invocation
    return pd.read_csv(name + date_range.strftime('%Y_%m_%d') + ext, **kwargs)

def log_done(module):  # use functions/methods to shorten (make more readable) long, repetitive method invocation
    logger.debug("%s Done" % module)

@contextlib.contextmanager  # contextmanager is great to separate business logic from exception handling
def mapper(function, iterable):
    try:
        yield map(function, iterable)  # map instead of executing function in "for" loop
    except IOError as err:
        logger.error('File does not exist: ', err.filename)

# Following code is visually tight and cleaner.
# Shows only what's needed, hiding most insignificant details and repetitive code
read_csv_aht = partial(read_csv, aht_name)  # partial pre-fills function (first argument) with arguments of this function (remaining arguments). In this case it is useful for feeding "map" function - it takes one-argument function to execute on each element of a list
with mapper(read_csv_aht, timestamps_in_range(daterange)) as calls:  # contextmanager beautifully hides "dangerous" content, sharing only the "safe" result to be used
    aht = pd.concat(calls)
    log_done('AHT')

read_csv_fcr = partial(read_csv, fcr_name)
with mapper(read_csv_fcr, timestamps_in_range(daterange)) as data:
    fcr = pd.concat(data)
    log_done('FCR')

read_csv_nps = partial(read_csv, nps_name, parse_dates=['call_date', 'date_completed'])
with mapper(read_csv_nps, timestamps_in_range(3, sub)) as frames:
    nps = pd.concat(frames)
    log_done('NPS')

read_csv_vas = partial(read_csv, vas_name, parse_dates=['Call_date'])
with mapper(read_csv_vas, timestamps_in_range(daterange)) as bracket:
    vas = pd.concat(bracket)
    log_done('VAS')
roster = pd.read_csv(location + roster_name)
log_done('Roster')
splits = pd.read_csv(location + splits_name)
log_done('Splits')
There are a lot of threads with questions similar to mine, but I can't find the correct answer.
My goal is to create an if statement that compares the time now to a schedule (the schedule defined by a start time and end time).
I have it working if I put specific numbers in for the schedule, but I need to pass variables into my statement seeing that the schedule is not going to be static.
What I have so far:
import time as sleeptime
from datetime import datetime, time
schedule_list = []
def scheduler():
conn = pymysql.connect(host='myhost', db='test', user='1234', passwd='123456', autocommit = True)
cur = conn.cursor()
query = ("SELECT startTimehour, startTimeminute, endTimehour, endTimeminute FROM schedule WHERE hwID = %s")
print query
cur.execute(query, (number))
for row in cur:
print row
door_schedule_list.append(row)
cur.close()
conn.close()
if len(door_schedule_list) > 0:
start_time_hour = door_schedule_list[0][0]
start_time_minute = door_schedule_list[0][1]
end_time_hour = door_schedule_list[0][2]
end_time_minute = door_schedule_list[0][3]
print start_time_hour
print start_time_minute
print end_time_hour
print end_time_minute
while True:
now = datetime.now()
#print (door_schedule_list)
#starttime = time(startTimehour, startTimeminute)
if time("%d","%d") <= now.time() <= time("%d","%d") % (start_time_hour, start_time_minute, end_time_hour, end_time_minute):
print "Schedule active"
sleeptime.sleep(20)
else:
print "Schedule Inactive"
sleeptime.sleep(20)
If there is an easier way to accomplish my goal, please let me know. Otherwise, how can I fix the error?
When you do:
if time("%d","%d") <= now.time() <= time("%d","%d") % (start_time_hour, start_time_minute, end_time_hour, end_time_minute)
the % operator is not doing what you think it should do.
It is not the same to write:
time("%d" % 5, "%d" % 4)
as
time("%d", "%d") % (5, 4)
The last line will complain with the error: an integer is required.
Also you can:
hours = 5
min = 4
time(hours, min)
So you can change the line to:
if time(start_time_hour, start_time_minute) <= now.time() <= time(end_time_hour, end_time_minute)
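Putting it together, a small self-contained sketch of the comparison, with the four schedule values hard-coded as stand-ins for whatever the database query returns:
from datetime import datetime, time

# stand-in values; in the real code these come from door_schedule_list
start_time_hour, start_time_minute = 8, 30
end_time_hour, end_time_minute = 17, 0

now = datetime.now()
if time(start_time_hour, start_time_minute) <= now.time() <= time(end_time_hour, end_time_minute):
    print("Schedule active")
else:
    print("Schedule Inactive")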