I am reading a bunch of daily files and using glob to concatenate them all together into separate dataframes. I eventually join them together and basically create a single large file which I use to connect to a dashboard. I am not too familiar with Python, but I use pandas and sklearn often.
As you can see, I am basically just reading the last 60 (or more) days' worth of data (the last 60 files) and creating a dataframe for each. This works, but I am wondering if there is a more Pythonic/better way. I watched a PyData talk (about not being restricted by PEP 8 and making sure your code is Pythonic), which was interesting.
(FYI - the reason I need to read 60 days' worth of data is that customers can fill out a survey about a call which happened a long time ago. A customer fills out a survey today about a call that happened in July; I need to know about that call: how long it lasted, what the topic was, etc.)
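For context, here is a minimal sketch of the glob-driven version of that idea (illustrative only; the callbycall_ prefix and %Y_%m_%d date format are taken from the AHT files in the code below, and the cutoff filter is an assumption):
import datetime as dt
import glob
import pandas as pd

cutoff = dt.date.today() - dt.timedelta(days=60)

def read_recent(prefix, date_fmt):
    """Concatenate every <prefix><datestamp>.csv whose date falls within the last 60 days."""
    frames = []
    for path in glob.glob(prefix + '*.csv'):
        stamp = path[len(prefix):-len('.csv')]          # e.g. '2020_01_15'
        try:
            file_date = dt.datetime.strptime(stamp, date_fmt).date()
        except ValueError:
            continue                                    # not a dated file; skip it
        if file_date >= cutoff:
            frames.append(pd.read_csv(path))
    return pd.concat(frames, ignore_index=True)

aht = read_recent('callbycall_', '%Y_%m_%d')
My current code is below.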
import datetime as dt
import os

import numpy as np
import pandas as pd
from pandas import Timedelta, Timestamp

os.chdir(r'C:\\Users\Documents\FTP\\')
loc = r'C:\\Users\Documents\\'
rosterloc = r'\\mand\\'
splitsname = r'Splits.csv'
fcrname = r'global_disp_'
npsname = r'survey_'
ahtname = r'callbycall_'
rostername = 'Daily_Roster.csv'
vasname = r'vas_report_'
ext ='.csv'
startdate = dt.date.today() - Timedelta('60 day')
enddate = dt.date.today()
daterange = Timestamp(enddate) - Timestamp(startdate)
daterange = (daterange / np.timedelta64(1, 'D')).astype(int)
data = []
frames = []
calls = []
bracket = []
try:
for date_range in (Timestamp(startdate) + dt.timedelta(n) for n in range(daterange)):
aht = pd.read_csv(ahtname+date_range.strftime('%Y_%m_%d')+ext)
calls.append(aht)
except IOError:
print('File does not exist:', ahtname+date_range.strftime('%Y_%m_%d')+ext)
aht = pd.concat(calls)
print('AHT Done')
try:
for date_range in (Timestamp(startdate) + dt.timedelta(n) for n in range(daterange)):
fcr = pd.read_csv(fcrname+date_range.strftime('%m_%d_%Y')+ext, parse_dates = ['call_time'])
data.append(fcr)
except IOError:
print('File does not exist:', fcrname+date_range.strftime('%m_%d_%Y')+ext)
fcr = pd.concat(data)
print('FCR Done')
try:
for date_range in (Timestamp(enddate) - dt.timedelta(n) for n in range(3)):
nps = pd.read_csv(npsname+date_range.strftime('%m_%d_%Y')+ext, parse_dates = ['call_date','date_completed'])
frames.append(nps)
except IOError:
print('File does not exist:', npsname+date_range.strftime('%m_%d_%Y')+ext)
nps = pd.concat(frames)
print('NPS Done')
try:
for date_range in (Timestamp(startdate) + dt.timedelta(n) for n in range(daterange)):
vas = pd.read_csv(vasname+date_range.strftime('%m_%d_%Y')+ext, parse_dates = ['Call_date'])
bracket.append(vas)
except IOError:
print('File does not exist:', vasname+date_range.strftime('%m_%d_%Y')+ext)
vas = pd.concat(bracket)
print('VAS Done')
roster = pd.read_csv(loc+rostername)
print('Roster Done')
splits = pd.read_csv(loc+splitsname)
print('Splits Done')
I didn't change the names, but IMHO they should be more verbose (e.g. pd == pandas? Not sure). Here is a more Pythonic way to write it:
from functools import partial
import logging
from operator import add, sub
import os
import datetime as dt
import contextlib

import numpy as np
import pandas as pd
from pandas import Timedelta, Timestamp
os.chdir(r'C:\\Users\Documents\FTP\\')
location = r'C:\\Users\Documents\\'
roster_location = r'\\mand\\'
splits_name = r'Splits.csv'
fcr_name = r'global_disp_'
nps_name = r'survey_'
aht_name = r'callbycall_'
roster_name = 'Daily_Roster.csv'
vas_name = r'vas_report_'
ext = '.csv'
start_date = dt.date.today() - Timedelta('60 day')
end_date = dt.date.today()
daterange = Timestamp(end_date) - Timestamp(start_date)
daterange = (daterange / np.timedelta64(1, 'D')).astype(int)
logger = logging.getLogger()  # a logger is better than "print" when you have multiple tiers to log; in this case: regular debug output and exceptions

def timestamps_in_range(daterange, method=add):  # inject the operation instead of an "if" statement for the subtracting case
    # when stepping backwards we start from end_date, otherwise forwards from start_date
    base = Timestamp(end_date) if method is sub else Timestamp(start_date)
    for n in range(daterange):
        yield method(base, dt.timedelta(n))  # a generator creates the series of dates in place

def read_csv(name, date_range, date_fmt='%Y_%m_%d', **kwargs):  # a helper shortens (makes more readable) the long, repetitive read_csv invocation
    # the source files use two different date formats, so the format is a keyword argument
    return pd.read_csv(name + date_range.strftime(date_fmt) + ext, **kwargs)

def log_done(module):  # same idea: hide the repetitive logging call
    logger.debug("%s Done", module)

@contextlib.contextmanager  # a contextmanager is great to separate business logic from exception handling
def mapper(function, iterable):
    try:
        yield map(function, iterable)  # map instead of executing the function in a "for" loop
    except IOError as err:
        logger.error('File does not exist: %s', err.filename)
# The following code is visually tighter and cleaner.
# It shows only what's needed, hiding the most insignificant details and repetitive code
read_csv_aht = partial(read_csv, aht_name) # partial pre-fills function (first argument) with arguments of this function (remaining arguments). In this case it is useful for feeding "map" function - it takes one-argument function to execute on each element of a list
with mapper(read_csv_aht, timestamps_in_range(daterange)) as calls: # contextmanager beautifully hides "dangerous" content, sharing only the "safe" result to be used
aht = pd.concat(calls)
log_done('AHT')
read_csv_fcr = partial(read_csv, fcr_name, date_fmt='%m_%d_%Y', parse_dates=['call_time'])
with mapper(read_csv_fcr, timestamps_in_range(daterange)) as data:
fcr = pd.concat(data)
log_done('FCR')
read_csv_nps = partial(read_csv, nps_name, date_fmt='%m_%d_%Y', parse_dates=['call_date', 'date_completed'])
with mapper(read_csv_nps, timestamps_in_range(3, sub)) as frames:
nps = pd.concat(frames)
log_done('NPS')
read_csv_vas = partial(read_csv, vas_name, date_fmt='%m_%d_%Y', parse_dates=['Call_date'])
with mapper(read_csv_vas, timestamps_in_range(daterange)) as bracket:
vas = pd.concat(bracket)
log_done('VAS')
roster = pd.read_csv(location + roster_name)
log_done('Roster')
splits = pd.read_csv(location + splits_name)
log_done('Splits')
Related
I have been trying to make my code faster by running parallel processes, with no luck. I am fetching weather data with an external library (https://github.com/pnuu/fmiopendata). Under the hood the library simply uses requests.get() to fetch data from the API. Any tips on how to proceed? I could surely edit the code of fmiopendata, but I would prefer a workaround and not have to refactor someone else's code.
Here is some working code, which I would like to edit:
from fmiopendata.wfs import download_stored_query
def parseStartTime(ts, year):
return str(year) + "-" + ts[0][0] + "-" + ts[0][1] + "T00:00:00Z"
def parseEndTime(ts, year):
return str(year) + "-" + ts[1][0] + "-" + ts[1][1] + "T23:59:59Z"
def weatherWFS(lat, lon, start_time, end_time):
    # Downloading the observations from the WFS server, using bbox and timestamps for querying
while True:
try:
obs = download_stored_query(
"fmi::observations::weather::daily::multipointcoverage",
args=["bbox="+str(lon - 1e-2)+","+str(lat - 1e-2)+","+str(lon + 1e-2)+","+str(lat + 1e-2),
"starttime=" + start_time,
"endtime=" + end_time])
if obs.data == {}:
return False
else:
return obs
except:
pass
def getWeatherData(lat, lon):
StartYear, EndYear = 2011, 2021
    # Handling the data in suitable chunks. Array pairs represent the starting and ending
# dates of the intervals in ["MM", "dd"] format
intervals = [
[["01", "01"], ["03", "31"]],
[["04", "01"], ["06", "30"]],
[["07", "01"], ["09", "30"]],
[["10", "01"], ["12", "31"]]
]
# Start and end timestamps are saved in an array
queries = [[parseStartTime(intervals[i], year),
parseEndTime(intervals[i], year)]
for year in range(StartYear, EndYear + 1)
for i in range(len(intervals))]
for query in queries:
# This is the request we need to run in parallel processing to save time
# the obs-objects need to be saved somehow and merged afterwards
obs = weatherWFS(lat, lon, query[0], query[1])
""" INSERT MAGIC CODE HERE """
lat, lon = 62.6, 29.72
getWeatherData(lat, lon)
Answering my own question:
The best solution I found so far is to use concurrent.futures with either its map() or submit() functions.
The solution suggested by Trambi does not improve the execution, as the requests are not CPU-intensive. The bottleneck here is the waiting time, during which the CPU has to stay idle, so using separate processes is not going to solve the problem. However, multithreading can improve the speed, as the threads are created and shut down more quickly.
Using ThreadPoolExecutor in combination with as_completed(), I was able to reduce the execution time by ~15%.
from concurrent.futures import ThreadPoolExecutor, as_completed
from fmiopendata.wfs import download_stored_query
def parseStartTime(ts, year):
return str(year) + "-" + ts[0][0] + "-" + ts[0][1] + "T00:00:00Z"
def parseEndTime(ts, year):
return str(year) + "-" + ts[1][0] + "-" + ts[1][1] + "T23:59:59Z"
def weatherWFS(lat, lon, start_time, end_time):
    # Downloading the observations from the WFS server, using bbox and timestamps for querying
while True:
try:
obs = download_stored_query(
"fmi::observations::weather::daily::multipointcoverage",
args=["bbox="+str(lon - 1e-2)+","+str(lat - 1e-2)+","+str(lon + 1e-2)+","+str(lat + 1e-2),
"starttime=" + start_time,
"endtime=" + end_time])
if obs.data == {}:
return False
else:
return obs
except:
pass
def getWeatherData(lat, lon):
StartYear, EndYear = 2011, 2021
    # Handling the data in suitable chunks. Array pairs represent the starting and ending
# dates of the intervals in ["MM", "dd"] format
intervals = [
[["01", "01"], ["03", "31"]],
[["04", "01"], ["06", "30"]],
[["07", "01"], ["09", "30"]],
[["10", "01"], ["12", "31"]]
]
# Start and end timestamps are saved in an array
queries = [
[lat, lon,
parseStartTime(intervals[i], year),
parseEndTime(intervals[i], year)]
for year in range(StartYear, EndYear)
for i in range(len(intervals))]
    with ThreadPoolExecutor() as executor:
        observations = [executor.submit(weatherWFS, *query) for query in queries]
        for obs in as_completed(observations):
            obs = obs.result()
            """do stuff with the observations"""
lat, lon = 62.6, 29.72
getWeatherData(lat, lon)
You could try using multiprocessing.Pool.
Replace your for query in queries: loop with something like:
import multiprocessing
iterable = zip([lat]*len(queries), [lon]*len(queries), queries)
pool = multiprocessing.Pool(len(queries))
obs_list = pool.starmap(weatherWFS, iterable)
pool.close()
pool.join()
Note that starmap will unpack each (lat, lon, query) tuple into separate arguments for weatherWFS, so you should change the function signature accordingly:
def weatherWFS(lat, lon, query):
start_time = query[0]
end_time = query[1]
Depending on the length of queries and its element you might also choose to unpack queries in your iterable...
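As one possible reading of that last point, here is a sketch (mine, not part of the answer above) that flattens each query up front and keeps the question's original 4-argument weatherWFS signature; the pool-size cap of 8 is arbitrary:
import multiprocessing

def run_queries(lat, lon, queries):
    # flatten each [start_time, end_time] pair into a (lat, lon, start, end) tuple
    iterable = [(lat, lon, start_time, end_time) for start_time, end_time in queries]
    with multiprocessing.Pool(processes=min(8, len(iterable))) as pool:
        # starmap unpacks each tuple onto weatherWFS(lat, lon, start_time, end_time)
        return pool.starmap(weatherWFS, iterable)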
Given a list of data to process on a 64-core CPU (plus 500 GB of RAM).
The script should sort strings and store the data in a result set of millions of records; that part runs just fine and takes a few seconds with multiprocessing.
But I also need to store the result somehow, either in a txt/csv output or a database. So far I haven't found a viable solution, because after the first part (process), the insert method either gives an error when I try it with MySQL pooling, or takes an insanely long time when writing the txt output.
What I've tried so far: simple txt output, printing out to a txt file, using the csv, pandas and numpy libraries. Nothing seems to speed it up. Any help would be greatly appreciated!
My code right now:
import os
import re
import datetime
import time
import csv
import mysql.connector as connector
from mysql.connector.pooling import MySQLConnectionPool
import mysql
import numpy as np
from tqdm import tqdm
from time import sleep
import multiprocessing as mp
import numpy
import sys
pool = MySQLConnectionPool( pool_name="sql_pool",
pool_size=32,
pool_reset_session=True,
host="localhost",
port="3306",
user="homestead",
password="secret",
database="homestead")
# # sql connection
db = mysql.connector.connect(
host="localhost",
port="3306",
user="homestead",
password="secret",
database="homestead"
)
sql_cursor = db.cursor()
delete_statement = "DELETE FROM statistics"
sql_cursor.execute(delete_statement)
db.commit()
sql_statement = "INSERT INTO statistics (name, cnt) VALUES (%s, %s)"
list = []
domains = mp.Manager().list()
unique_list = mp.Manager().list()
invalid_emails = mp.Manager().list()
result = mp.Manager().list()
regex_email = r'^(\w|\.|\_|\-)+[#](\w|\_|\-|\.)+[.]\w{2,3}$'
# check email validity
def check(list, email):
if(re.search(regex_email, email)):
domains.append(email.lower().split('#')[1])
return True
else:
invalid_emails.append(email)
return False
#end of check email validity
# execution time converter
def convertTime(seconds):
seconds = seconds % (24 * 3600)
hour = seconds // 3600
seconds %= 3600
minutes = seconds // 60
seconds %= 60
if(hour == 0):
if(minutes == 0):
return "{0} sec".format(seconds)
else:
return "{0}min {1}sec".format(minutes, seconds)
else:
return "{0}hr {1}min {2}sec".format(hour, minutes, seconds)
# execution time converter end
#process
def process(list):
for item in tqdm(list):
if(check(list, item)):
item = item.lower().split('#')[1]
if item not in unique_list:
unique_list.append(item)
# end of process
def insert(list):
global sql_statement
# Add to db
con = pool.get_connection()
cur = con.cursor()
print("PID %d: using connection %s" % (os.getpid(), con))
#cur.executemany(sql_statement, sorted(map(set_result, list)))
for item in list:
cur.execute(sql_statement, (item, domains.count(item)))
con.commit()
cur.close()
con.close()
# def insert_into_database(list):
#sql_cursor.execute(sql_statement, (unique_list, 1), multi=True)
# sql_cursor.executemany(sql_statement, sorted(map(set_result, list)))
# db.commit()
# statistics
def statistics(list):
for item in tqdm(list):
if(domains.count(item) > 0):
result.append([domains.count(item), item])
# end of statistics
params = sys.argv
filename = ''
process_count = -1
for i, item in enumerate(params):
if(item.endswith('.txt')):
filename = item
if(item == '--top'):
process_count = int(params[i+1])
def set_result(item):
return item, domains.count(item)
# main
if(filename):
try:
start_time = time.time()
now = datetime.datetime.now()
dirname = "email_stats_{0}".format(now.strftime("%Y%m%d_%H%M%S"))
os.mkdir(dirname)
list = open(filename).read().split()
if(process_count == -1):
process_count = len(list)
if(process_count > 0):
list = list[:process_count]
#chunking list
n = int(len(list) / mp.cpu_count())
chunks = [list[i:i + n] for i in range(0, len(list), n)]
processes = []
print('Processing list on {0} cores...'.format(mp.cpu_count()))
for chunk in chunks:
p = mp.Process(target=process, args=[chunk])
p.start()
processes.append(p)
for p in processes:
p.join()
# insert(unique_list)
## step 2 - write sql
## Clearing out db before new data insert
con = pool.get_connection()
cur = con.cursor()
delete_statement = "DELETE FROM statistics"
cur.execute(delete_statement)
u_processes = []
#Maximum pool size for sql is 32, so maximum chunk number should be that too.
if(mp.cpu_count() < 32):
n2 = int(len(unique_list) / mp.cpu_count())
else:
n2 = int(len(unique_list) / 32)
u_chunks = [unique_list[i:i + n2] for i in range(0, len(unique_list), n2)]
for u_chunk in u_chunks:
p = mp.Process(target=insert, args=[u_chunk])
p.start()
u_processes.append(p)
for p in u_processes:
p.join()
for p in u_processes:
p.close()
# sql_cursor.executemany(sql_statement, sorted(map(set_result, unique_list)))
# db.commit()
# for item in tqdm(unique_list):
# sql_val = (item, domains.count(item))
# sql_cursor.execute(sql_statement, sql_val)
#
# db.commit()
## numpy.savetxt('saved.txt', sorted(map(set_result, unique_list)), fmt='%s')
# with(mp.Pool(mp.cpu_count(), initializer = db) as Pool:
# Pool.map_async(insert_into_database(),set(unique_list))
# Pool.close()
# Pool.join()
print('Creating statistics for {0} individual domains...'.format(len(unique_list)))
# unique_list = set(unique_list)
# with open("{0}/result.txt".format(dirname), "w+") as f:
# csv.writer(f).writerows(sorted(map(set_result, unique_list), reverse=True))
print('Writing final statistics...')
print('OK.')
f = open("{0}/stat.txt".format(dirname),"w+")
f.write("Number of processed emails: {0}\r\n".format(process_count))
f.write("Number of valid emails: {0}\r\n".format(len(list) - len(invalid_emails)))
f.write("Number of invalid emails: {0}\r\n".format(len(invalid_emails)))
f.write("Execution time: {0}".format(convertTime(int(time.time() - start_time))))
f.close()
except FileNotFoundError:
print('File not found, path or file broken.')
else:
print('Wrong file format, should be a txt file.')
# main
See my comments regarding some changes you might wish to make, one of which might improve performance. But I think one area of performance which could really be improved is your use of managed lists. These are represented by proxies, and each operation on such a list is essentially a remote procedure call and thus very slow. You cannot avoid this given that you need multiple processes updating a common, shared list (or dict, if you take my suggestion). But in the main process you might be trying, for example, to construct a set from a shared list as follows:
Pool.map_async(insert_into_database(),set(unique_list))
(by the way, that should be Pool.map(insert_into_database, set(unique_list)), i.e. you have an extra set of () and you can then get rid of the calls to pool.close() and pool.join() if you wish)
The problem is that you are iterating every element of unique_list through a proxy, which might be what is taking a very long time. I say "might" because I would think the use of managed lists would prevent the code as is, i.e. without outputting the results, from completing in "a few seconds" if we are talking about "millions" of records and thus millions of remote procedure calls. But this number could certainly be reduced if you could somehow get the underlying list as a native list.
First, you need to heed my comment about having declared a variable named list, thus making it impossible to create native lists or subclasses of list. Once you have renamed that variable to something more reasonable, we can create our own managed class MyList that will expose the underlying list on which it is built. Note that you can do the same thing with a MyDict class that subclasses dict. I have defined both classes for you. Here is a benchmark showing the difference between constructing a native list from a managed list versus creating a native list from a MyList:
import multiprocessing as mp
from multiprocessing.managers import BaseManager
import time
class MyManager(BaseManager):
pass
class MyList(list):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
def get_underlying_list(self):
return self
class MyDict(dict):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
def get_underlying_dict(self):
return self
# required for windows, which I am running on:
if __name__ == '__main__':
l = mp.Manager().list()
for i in range(100_000):
l.append(i)
t = time.time()
l2 = list(l)
print(time.time() - t, l2[0:5], l2[-5:])
MyManager.register('MyList', MyList)
MyManager.register('MyDict', MyDict)
my_manager = MyManager()
# must explicitly start the manager or use: with MyManager() as manager:
my_manager.start()
l = my_manager.MyList()
for i in range(100_000):
l.append(i)
t = time.time()
l2 = list(l.get_underlying_list())
print(time.time() - t, l2[0:5], l2[-5:])
Prints:
7.3949973583221436 [0, 1, 2, 3, 4] [99995, 99996, 99997, 99998, 99999]
0.007997751235961914 [0, 1, 2, 3, 4] [99995, 99996, 99997, 99998, 99999]
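To give an idea of how this could slot into the original script, here is a sketch (it assumes the MyList registration shown in the benchmark above has already run, and that the variable currently named list has been renamed as suggested):
my_manager = MyManager()   # MyManager.register('MyList', MyList) has already been done
my_manager.start()

domains = my_manager.MyList()
unique_list = my_manager.MyList()
invalid_emails = my_manager.MyList()

# ... start and join the worker processes exactly as before ...

# one remote call to fetch the whole list, instead of millions of
# per-element proxy accesses:
unique_domains = set(unique_list.get_underlying_list())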
I store QueryText within a pandas dataframe. Once I've loaded all the queries into it, I want to conduct an analysis against each query. Currently, I have ~50k to evaluate, so doing it one by one will take a long time.
So I wanted to implement concurrent.futures. How do I take the individual QueryText stored within fullAnalysis, pass it to concurrent.futures, and return the output as a variable?
Here is my entire code:
import pandas as pd
import time
import gensim
import sys
import warnings
from concurrent.futures import ThreadPoolExecutor
from concurrent.futures import as_completed
fullAnalysis = pd.DataFrame()
def fetch_data(jFile = 'ProcessingDetails.json'):
print("Fetching data...please wait")
#read JSON file for latest dictionary file name
baselineDictionaryFileName = 'Dictionary/Dictionary_05-03-2020.json'
#copy data to pandas dataframe
labelled_data = pd.read_json(baselineDictionaryFileName)
#Add two more columns to get the most similar text and score
labelled_data['SimilarText'] = ''
labelled_data['SimilarityScore'] = float()
print("Data fetched from " + baselineDictionaryFileName + " and there are " + str(labelled_data.shape[0]) + " rows to be evalauted")
return labelled_data
def calculateScore(inputFunc):
warnings.filterwarnings("ignore", category=DeprecationWarning)
model = gensim.models.Word2Vec.load('w2v_model_bigdata')
inp = inputFunc
print(inp)
out = dict()
strEvaluation = inp.split("most_similar ",1)[1]
#while inp != 'quit':
split_inp = inp.split()
try:
if split_inp[0] == 'help':
pass
elif split_inp[0] == 'similarity' and len(split_inp) >= 3:
pass
elif split_inp[0] == 'most_similar' and len(split_inp) >= 2:
for pair in model.most_similar(positive=[split_inp[1]]):
out.update({pair[0]: pair[1]})
except KeyError as ke:
#print(str(ke) + "\n")
inp = input()
return out
def main():
with ThreadPoolExecutor(max_workers=5) as executor:
for i in range(len(fullAnalysis)):
text = fullAnalysis['QueryText'][i]
arg = 'most_similar'+ ' ' + text
#for item in executor.map(calculateScore, arg):
output = executor.map(calculateScore, arg)
return output
if __name__ == "__main__":
fullAnalysis = fetch_data()
results = main()
print(f'results: {results}')
The Python Global Interpreter Lock, or GIL, allows only one thread to hold control of the Python interpreter. Since your function calculateScore might be CPU-bound and requires the interpreter to execute its byte code, you may gain little by using threading. If, on the other hand, it were doing mostly I/O operations, it would be giving up the GIL for most of its running time, allowing other threads to run. But that does not seem to be the case here. You probably should be using the ProcessPoolExecutor from concurrent.futures (try it both ways and see):
def main():
with ProcessPoolExecutor(max_workers=None) as executor:
the_futures = {}
for i in range(len(fullAnalysis)):
text = fullAnalysis['QueryText'][i]
arg = 'most_similar'+ ' ' + text
future = executor.submit(calculateScore, arg)
the_futures[future] = i # map future to request
for future in as_completed(the_futures): # results as they become available not necessarily the order of submission
i = the_futures[future] # the original index
result = future.result() # the result
If you omit the max_workers parameter (or specify a value of None) from the ProcessPoolExecutor constructor, the default will be the number of processors you have on your machine (not a bad default). There is no point in specifying a value larger than the number of processors you have.
If you do not need to tie the future back to the original request, then the_futures can just be a list to which the futures are appended. But simplest yet is to not even bother with the as_completed method:
def main():
with ProcessPoolExecutor(max_workers=5) as executor:
the_futures = []
for i in range(len(fullAnalysis)):
text = fullAnalysis['QueryText'][i]
arg = 'most_similar'+ ' ' + text
future = executor.submit(calculateScore, arg)
the_futures.append(future)
# wait for the completion of all the results and return them all:
        results = [f.result() for f in the_futures]  # results in submission order
return results
It should be mentioned that the code that launches the ProcessPoolExecutor functions should be in a block governed by if __name__ == '__main__':. If it isn't, you will get into a recursive loop with each subprocess launching the ProcessPoolExecutor. But that already seems to be the case here. Perhaps you meant to use the ProcessPoolExecutor all along?
Also:
I don't know what the line ...
model = gensim.models.Word2Vec.load('w2v_model_bigdata')
... in function calculateScore does. It may be the one I/O-bound statement. But this appears to be something that does not vary from call to call. If that is the case, and model is not being modified in the function, shouldn't this statement be moved out of the function and computed just once? Then this function would clearly run faster (and be clearly CPU-bound).
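For example, a minimal sketch of that idea (an illustration, not code from the question; it assumes Python 3.7+, where ProcessPoolExecutor accepts initializer/initargs, so the model is loaded once per worker process instead of on every call):
from concurrent.futures import ProcessPoolExecutor
import gensim

model = None  # populated once per worker by init_worker()

def init_worker(model_path):
    global model
    model = gensim.models.Word2Vec.load(model_path)

def calculateScore(arg):
    # ... use the module-level `model` here instead of reloading it ...
    ...

def main():
    with ProcessPoolExecutor(max_workers=None,
                             initializer=init_worker,
                             initargs=('w2v_model_bigdata',)) as executor:
        ...  # submit the calculateScore calls as shown above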
Also:
The exception block ...
except KeyError as ke:
#print(str(ke) + "\n")
inp = input()
... is puzzling. You are inputting a value that will never be used right before returning. If this is to pause execution, there is no error message being output.
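For instance, that block could at least surface the error instead of silently waiting for input (a sketch of one possible intent, not code from the question):
    except KeyError as ke:
        # report the word that is missing from the model's vocabulary
        print("Not found in vocabulary:", ke)
    return out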
With Booboo's assistance, I was able to update the code to include ProcessPoolExecutor. Here is my updated code. Overall, processing has been sped up by more than 60%.
I did run into a processing issue and found this topic, BrokenProcessPool, which addresses the issue.
output = {}
thePool = {}
def main(labelled_data, dictionaryRevised):
args = sys.argv[1:]
with ProcessPoolExecutor(max_workers=None) as executor:
for i in range(len(labelled_data)):
text = labelled_data['QueryText'][i]
arg = 'most_similar'+ ' '+ text
output = winprocess.submit(
executor, calculateScore, arg
)
thePool[output] = i #original index for future to request
for output in as_completed(thePool): # results as they become available not necessarily the order of submission
i = thePool[output] # the original index
text = labelled_data['QueryText'][i]
result = output.result() # the result
maximumKey = max(result.items(), key=operator.itemgetter(1))[0]
maximumValue = result.get(maximumKey)
labelled_data['SimilarText'][i] = maximumKey
labelled_data['SimilarityScore'][i] = maximumValue
return labelled_data, dictionaryRevised
if __name__ == "__main__":
start = time.perf_counter()
print("Starting to evaluate Query Text for labelling...")
output_Labelled_Data, output_dictionary_revised = preProcessor()
output,dictionary = main(output_Labelled_Data, output_dictionary_revised)
finish = time.perf_counter()
print(f'Finished in {round(finish-start, 2)} second(s)')
I have a process Pool in Python that is starting processes as normal; however, I have just realized that these processes are not closed after completion (I know that they completed, as the last statement is a file write).
Below is the code, with an example function ppp:
from multiprocessing import Pool
import itertools
import time

def ppp(element):
    window, day = element
    print(window, day)
    time.sleep(10)
if __name__ == '__main__': ##The line marked
print('START')
start_time = current_milli_time()
days = ['0808', '0810', '0812', '0813', '0814', '0817', '0818', '0827']
windows = [1000,2000,3000,4000,5000,10000,15000, 20000,30000,60000,120000,180000]
processes_args = list(itertools.product(windows, days))
pool = Pool(8)
results = pool.map(ppp, processes_args)
pool.close()
pool.join()
print('END', current_milli_time()-start_time)
I am working on Linux, Ubuntu 16.04. Everything was working fine before I added the marked line in the example. I am wondering if that behavior can be related to the missing return statement. Anyway, this is what my htop looks like:
As you can see, no process is closed, but all of them have completed their work.
I found that related question: Python Multiprocessing pool.close() and join() does not close processes, however, I have not understood if the solution to this problem is to use map_async instead of map.
EDIT: real function code:
def process_day(element):
window,day = element
noise = 0.2
print('Processing day:', day,', window:', window)
individual_files = glob.glob('datan/'+day+'/*[0-9].csv')
individual = readDataset(individual_files)
label_time = individual.loc[(individual['LABEL_O'] != -2) | (individual['LABEL_F'] != -2), 'TIME']
label_time = list(np.unique(list(label_time)))
individual = individual[individual['TIME'].isin(label_time)]
#Saving IDs for further processing
individual['ID'] = individual['COLLAR']
#Time variable in seconds for aggregation and merging
individual['TIME_S'] = individual['TIME'].copy()
noise_x = np.random.normal(0,noise,len(individual))
noise_y = np.random.normal(0,noise,len(individual))
noise_z = np.random.normal(0,noise,len(individual))
individual['X_AXIS'] = individual['X_AXIS'] + noise_x
individual['Y_AXIS'] = individual['Y_AXIS'] + noise_y
individual['Z_AXIS'] = individual['Z_AXIS'] + noise_z
    #Time synchronization (applying milliseconds for time series processing)
    print('Time synchronization:')
with progressbar.ProgressBar(max_value=len(individual.groupby('ID'))) as bar:
for baboon,df_baboon in individual.groupby('ID'):
times = list(df_baboon['TIME'].values)
d = Counter(times)
result = []
for timestamp in np.unique(times):
for i in range(0,d[timestamp]):
result.append(str(timestamp+i*1000/d[timestamp]))
individual.loc[individual['ID'] == baboon,'TIME'] = result
bar.update(1)
#Time series process
ts_process = time_series_processing(window, 'TIME_S', individual, 'COLLAR', ['COLLAR', 'TIME', 'X_AXIS','Y_AXIS','Z_AXIS'])
#Aggregation and tsfresh
ts_process.do_process()
individual = ts_process.get_processed_dataframe()
individual.to_csv('noise2/processed_data/'+str(window)+'/agg/'+str(day)+'.csv', index = False)
    #Network inference process
ni = network_inference_process(individual, 'TIME_S_mean')
#Inference
ni.do_process()
final = ni.get_processed_dataframe()
final.to_csv('noise2/processed_data/'+str(window)+'/net/'+str(day)+'.csv', index = False)
#Saving not aggregated ground truth
ground_truth = final[['ID_mean', 'TIME_S_mean', 'LABEL_O_values', 'LABEL_F_values']].copy()
#Neighbor features process
neighbors_features_f = ni.get_neighbor_features(final, 'TIME_S_mean', 'ID_mean')
neighbors_features_f = neighbors_features_f.drop(['LABEL_O_values_n', 'LABEL_F_values_n'], axis=1)
neighbors_features_f.to_csv('noise2/processed_data/'+str(window)+'/net/'+str(day)+'_neigh.csv', index = False)
# Final features dataframe
final_neigh = pd.merge(final, neighbors_features_f, how='left', left_on=['TIME_S_mean','ID_mean'], right_on = ['TIME_S_mean_n','BABOON_NODE_n'])
final_neigh.to_csv('noise2/processed_data/'+str(window)+'/complete/'+str(day)+'.csv', index = False)
return
So as you can see, the last statement is a write to a file, and it is executed by all the processes; I do not actually think that the problem is inside this function.
I am working on a project to check a file directory and automatically add log files as they are created. A file is being generated every five minutes, but some of the files are being created with a "0" filesize and I would like to alert when this happens.
So the sequence of steps I would like to have are essentially:
Get time (MM:DD:YY HH:MM:SS) *Not sure if I need to do this...
CD to Folder Directory /Netflow/YY/MM/DD
Search for filename "nfcapd.YYYYMMDDHHMM" where MM increments by 5.
If filesize is 0, then email Johnny, Sally and Jimmy
Wait 6 minutes and repeat
This is what I have pieced together thus far. How can I get the desired functionality?
import os
import time

def is_non_zero_file(fpath):
    return os.path.isfile(fpath) and os.path.getsize(fpath) > 0

# I need to check storage/Netflow for files named by time e.g. 13_56_05.txt
while True:
    time.sleep(360)
In addition to enumerating the files in a given path and subsequently filtering down to only the zero-length files, you probably want to maintain some type of state to ensure you aren't notified multiple times about the same zero-length file. That is, you probably don't want to be told that the same file is zero-length indefinitely (although you can modify the example below if you want that behavior).
You may optionally want to do things like verify that the file name strictly meets your naming convention. You may also want to validate that the date stamp included in the file name is a valid datetime.
The example below uses the glob module (itself leveraging os.listdir() and fnmatch.fnmatch()) to build up a set of possible files for inclusion. [1]
The example is intentionally simple and uses a single class to store log sample 'state'. KEEP_SAMPLES samples are maintained (instances of logState() in the log_states list), achieved by using list slicing.
A single alert(msg) function is supplied as a stub for something that might send mail, etc...
References:
[1] https://docs.python.org/3.2/library/glob.html
#!/usr/bin/python3
import os
import glob
import re
from datetime import datetime, timezone
import time
from pprint import pprint
class logState():
def __init__(self, log_path, glob_patt, re_patt, dt_fmt):
self.dt = datetime.now(timezone.utc)
self.log_path = log_path
self.glob_patt = glob_patt
self.re_patt = re_patt
self.dt_fmt = dt_fmt
self.empty_logs = []
self.nonempty_logs = []
# Retrieve only files from glob
self.files = [ f for f in
glob.glob(self.log_path + self.glob_patt)
if os.path.isfile(f) ]
for f in self.files:
unq_fname = f.split('/')[-1]
if unq_fname == None:
continue
# Tighter pattern matching
            if re.match(re_patt, unq_fname) is None:
                continue
            # Get the datetime portion of the file name
            f_dtstamp = unq_fname.split('.')[-1]
            # Make sure the datetime stamp represents a valid date;
            # strptime() raises ValueError for an invalid one
            try:
                datetime.strptime(f_dtstamp, self.dt_fmt)
            except ValueError:
                continue
# Check file size, add to the appropriate
# list
if os.path.getsize(f) <= 0:
self.empty_logs.append(f)
else:
self.nonempty_logs.append(f)
def alert(msg):
print("ALERT!: {0}".format(msg))
if __name__ == "__main__":
# How long to sleep
SLEEP_SECS = 5
# How many samples to keep
KEEP_SAMPLES = 5
log_states = []
# Definition for what logs states we'll look for
log_path = './'
glob_patt = 'nfcapd.[0-9]*'
re_patt = 'nfcapd.([0-9]{12})'
dt_fmt = "%Y%m%d%H%M"
print("-- Setup --")
print("Sample files in '{0}'".format(log_path))
print("\t{0} samples kept:".format(KEEP_SAMPLES))
print("\tglob pattern: '{0}'".format(glob_patt))
print("\tregex pattern: '{0}'".format(re_patt))
print("\tdatetime string: '{0}'".format(dt_fmt))
print("")
# Collect the initial state
log_states.append(logState(log_path,
glob_patt,
re_patt, dt_fmt))
while True:
# Print state inventory and current state detail
print( "-- Log States Stored --")
for i, log_state in enumerate(log_states):
print("Log state {0} # {1}".format(i, log_state.dt))
print(" -- Logs size > 0 --")
pprint(log_states[-1].nonempty_logs)
print(" -- Logs size <= 0 --")
pprint(log_states[-1].empty_logs)
print("")
time.sleep(SLEEP_SECS)
log_states = log_states[-KEEP_SAMPLES+1:]
log_states.append(logState(log_path,
glob_patt,
re_patt,
dt_fmt))
# p = previous sample, c = current
p = set(log_states[-2].empty_logs)
c = set(log_states[-1].empty_logs)
# only report the items in the current sample
# not in the last
if len(c.difference(p)) > 0:
alert("\nNew zero length logs: " + str(c.difference(p)) + "\n")