asyncio issues: "Future pending" - Python
I am currently having issues running this with joblib multiprocessing, or as a parallel program in general. I had this working before at around 1 minute total, but I changed a lot of things and broke something along the way. I have posted the bare-bones code, since I get the same error with it. I am trying to loop through roughly 150 stock symbols and use Yahoo Finance to fetch the option chain for each one, on a minute-by-minute basis. I have also tried other approaches such as asyncio and have been unsuccessful with that. Any recommendations would be much appreciated.
import asyncio
from datetime import datetime

import pandas as pd
import yfinance as yf

def background(f):
    def wrapped(*args, **kwargs):
        return asyncio.get_event_loop().run_in_executor(None, f, *args, **kwargs)
    return wrapped

done = []

@background
def downloadChain(ticker):
    print(ticker)
    df = pd.DataFrame()
    daysOut = 100
    chain = 0
    try:
        yf_ticker = yf.Ticker(ticker)
        expiration_dates = yf_ticker.options
        for expiration_date in expiration_dates:
            if (datetime.fromisoformat(expiration_date) - datetime.now()).days <= daysOut:
                try:
                    chain = yf_ticker.option_chain(expiration_date)
                    df = df.append(chain)
                except Exception as e:
                    pass
    except Exception as e:
        pass
    done.append(ticker)
Main loop:
symbols = ["WATT","TSLA","UVXY","VXX","KEYS","EGO","GLD","WORK","BYND","BLK","PINS","LYFT","SPCE","PAYC","WDAY","UBER","CHGG","SHAK","CMG","CTL","ACB","TLRY","CGC","MJ","ORCL","GRUB","RNG","JWN","TTWO","ADI","ATVI","EA","SNE","GAMR","TXN","TMUS","MCHP","TSM","XBI","ETFC","MS","IWM","EXPD","RCL","CCL","MOMO","BABA","VMW","CRM","ULTA","SKYY","SPLK","FLWS","AVGO","TWTR","PANW","RJF","SABR","LOW","RS","ON","VEEV","DOCU","FB","SNAP","HPQ","RACE","F","AMAT","MRO","STM","AAL","DAL","VICR","XLC","CRON","DELL","T","VZ","S","MELI","CVM","REGN","NVAX","APT","CODX","LAKE","MRNA","EBS","INO", "SPY","SH","QQQ","XLF","KRE","XLV","HYG","LQD","NET","NFLX","ROKU","SHOP","AMZN","AAPL","MSFT","GOOGL","GOOG","NVDA","MU","AMD","INTC","MRVL","QCOMM","SQ","PYPL","TTD","TSLA","ZM","TDOC","LVGO","MDB","HD","VNQ","ARI","ACC","IIPR","EQR","EPR","SPG","PLD","ACB","WHR","NVAX","APT","MDT","CLRX","COST","SDC","LK","PVH","KSS","M","LULU","NKE","KO","BAC","JPM","CS","WFC","ARKW","ARKK","MGM","AMAT","WYNN","TGT","ITT","FXI"]
for ticker in symbols:
    downloadChain(ticker)
I added a separate loop to watch the size of the "done" list, which holds all the symbols that have completed. I am not sure what I changed, but the full pass now takes about 10-15 minutes when roughly 1 minute is expected.
while True:
    clear_output(wait=True)
    print(len(done))
There are two versions of the "fix". I am adding them as an answer rather than carrying on a chat in the comments :)
import asyncio
from datetime import datetime

import pandas as pd
import yfinance as yf
from concurrent.futures import ThreadPoolExecutor

def background(f):
    def wrapped(*args, **kwargs):
        return asyncio.get_event_loop().run_in_executor(executor, f, *args, **kwargs)
    return wrapped

done = []

@background
def downloadChain(ticker):
    print(ticker)
    df = pd.DataFrame()
    daysOut = 100
    chain = 0
    try:
        yf_ticker = yf.Ticker(ticker)
        expiration_dates = yf_ticker.options
        for expiration_date in expiration_dates:
            if (datetime.fromisoformat(expiration_date) - datetime.now()).days <= daysOut:
                try:
                    chain = yf_ticker.option_chain(expiration_date)
                    df = df.append(chain)
                except Exception as e:
                    pass
    except Exception as e:
        pass
    done.append(ticker)
symbols = ["WATT","TSLA","UVXY","VXX","KEYS","EGO","GLD","WORK","BYND","BLK","PINS","LYFT","SPCE","PAYC","WDAY","UBER","CHGG","SHAK","CMG","CTL","ACB","TLRY","CGC","MJ","ORCL","GRUB","RNG","JWN","TTWO","ADI","ATVI","EA","SNE","GAMR","TXN","TMUS","MCHP","TSM","XBI","ETFC","MS","IWM","EXPD","RCL","CCL","MOMO","BABA","VMW","CRM","ULTA","SKYY","SPLK","FLWS","AVGO","TWTR","PANW","RJF","SABR","LOW","RS","ON","VEEV","DOCU","FB","SNAP","HPQ","RACE","F","AMAT","MRO","STM","AAL","DAL","VICR","XLC","CRON","DELL","T","VZ","S","MELI","CVM","REGN","NVAX","APT","CODX","LAKE","MRNA","EBS","INO", "SPY","SH","QQQ","XLF","KRE","XLV","HYG","LQD","NET","NFLX","ROKU","SHOP","AMZN","AAPL","MSFT","GOOGL","GOOG","NVDA","MU","AMD","INTC","MRVL","QCOMM","SQ","PYPL","TTD","TSLA","ZM","TDOC","LVGO","MDB","HD","VNQ","ARI","ACC","IIPR","EQR","EPR","SPG","PLD","ACB","WHR","NVAX","APT","MDT","CLRX","COST","SDC","LK","PVH","KSS","M","LULU","NKE","KO","BAC","JPM","CS","WFC","ARKW","ARKK","MGM","AMAT","WYNN","TGT","ITT","FXI"]
with ThreadPoolExecutor() as executor:
    for ticker in symbols:
        downloadChain(ticker)
The second version is more standard: we define an async main function and hand it to asyncio.run as the entry point.
import asyncio
from datetime import datetime

import pandas as pd
import yfinance as yf
from concurrent.futures import ProcessPoolExecutor
symbols = ["WATT","TSLA","UVXY","VXX","KEYS","EGO","GLD","WORK","BYND","BLK","PINS","LYFT","SPCE","PAYC","WDAY","UBER","CHGG","SHAK","CMG","CTL","ACB","TLRY","CGC","MJ","ORCL","GRUB","RNG","JWN","TTWO","ADI","ATVI","EA","SNE","GAMR","TXN","TMUS","MCHP","TSM","XBI","ETFC","MS","IWM","EXPD","RCL","CCL","MOMO","BABA","VMW","CRM","ULTA","SKYY","SPLK","FLWS","AVGO","TWTR","PANW","RJF","SABR","LOW","RS","ON","VEEV","DOCU","FB","SNAP","HPQ","RACE","F","AMAT","MRO","STM","AAL","DAL","VICR","XLC","CRON","DELL","T","VZ","S","MELI","CVM","REGN","NVAX","APT","CODX","LAKE","MRNA","EBS","INO", "SPY","SH","QQQ","XLF","KRE","XLV","HYG","LQD","NET","NFLX","ROKU","SHOP","AMZN","AAPL","MSFT","GOOGL","GOOG","NVDA","MU","AMD","INTC","MRVL","QCOMM","SQ","PYPL","TTD","TSLA","ZM","TDOC","LVGO","MDB","HD","VNQ","ARI","ACC","IIPR","EQR","EPR","SPG","PLD","ACB","WHR","NVAX","APT","MDT","CLRX","COST","SDC","LK","PVH","KSS","M","LULU","NKE","KO","BAC","JPM","CS","WFC","ARKW","ARKK","MGM","AMAT","WYNN","TGT","ITT","FXI"]
done = []

def downloadChain(ticker):
    print(ticker)
    df = pd.DataFrame()
    daysOut = 100
    chain = 0
    try:
        yf_ticker = yf.Ticker(ticker)
        expiration_dates = yf_ticker.options
        for expiration_date in expiration_dates:
            if (datetime.fromisoformat(expiration_date) - datetime.now()).days <= daysOut:
                try:
                    chain = yf_ticker.option_chain(expiration_date)
                    df = df.append(chain)
                except Exception as e:
                    pass
    except Exception as e:
        pass
    done.append(ticker)

async def main():
    with ProcessPoolExecutor() as executor:
        for ticker in symbols:
            asyncio.get_event_loop().run_in_executor(executor, downloadChain, ticker)

if __name__ == '__main__':
    asyncio.run(main())
Here you also have finer control over which executor is used: we state explicitly which event loop we are working with and which executor the work is submitted to. One caveat with the ProcessPoolExecutor variant: the workers run in separate processes, so the appends to the done list happen in the children and are not visible in the parent process. Local tests did not show a large difference between ProcessPoolExecutor and ThreadPoolExecutor for this workload.
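Since the title mentions futures stuck in a "pending" state, here is a minimal sketch, not part of either fix above, of keeping the futures and awaiting them so the program only leaves main() once every download has actually finished. It assumes Python 3.7+, the plain (undecorated) downloadChain and the symbols list from the post; the worker count is an arbitrary choice:

import asyncio
from concurrent.futures import ThreadPoolExecutor

async def main():
    loop = asyncio.get_running_loop()
    with ThreadPoolExecutor(max_workers=16) as executor:
        # schedule one executor job per ticker and keep the resulting futures
        futures = [loop.run_in_executor(executor, downloadChain, ticker) for ticker in symbols]
        # block main() until every download has completed (or raised)
        await asyncio.gather(*futures)

if __name__ == '__main__':
    asyncio.run(main())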
Related
I want to build an EMA indicator alert for a specific list of crypto pairs
First, my configuration file (config.py):

# configuration details
TELEGRAM_TOKEN = ''    # telegram bot token
TELEGRAM_CHANNEL = ''  # channel id
INTERVAL = '1m'        # binance time interval
SHORT_EMA = 7          # short interval for ema
LONG_EMA = 21          # long interval for ema

Here is my second file:

import requests
import talib
import time
import numpy as np
import websocket
from config import TELEGRAM_TOKEN, TELEGRAM_CHANNEL, INTERVAL, SHORT_EMA, LONG_EMA

def streamKline(currency, interval):
    websocket.enableTrace(False)
    socket = f'wss://stream.binance.com:9443/ws/{currency}@kline_{interval}'
    ws = websocket.WebSocketApp(socket)
    ws.run_forever()

# SYMBOLS TO LOOK FOR ALERTS
SYMBOLS = ["ETHUSDT", "BTCUSDT", "ATOMUSDT", "BNBUSDT", "FTMBUSD", "ENJUSDT", "WAXPUSDT"]

# sending alerts to telegram
def send_message(message):
    url = "https://api.telegram.org/bot{}/sendMessage?chat_id={}&text={}&parse_mode=markdown".format(TELEGRAM_TOKEN, TELEGRAM_CHANNEL, message)
    res = requests.get(url)
    print(url)
    return res

# getting klines data to process
def streamKline(symbol):
    # more data means more precision, at a trade-off in speed and time
    data = socket.streamKline(symbol=symbol, interval=INTERVAL, limit=300)
    return_data = []
    # taking the closing price from each kline
    for each in data:
        return_data.append(float(each[4]))  # 4 is the index of the closing price in each kline
    return np.array(return_data)  # returning as a numpy array for better precision and performance

def main():
    # an infinite loop that keeps checking the condition
    while True:
        # looping through each coin
        for each in SYMBOLS:
            data = streamKline(each)
            ema_short = talib.EMA(data, int(SHORT_EMA))
            ema_long = talib.EMA(data, int(LONG_EMA))
            last_ema_short = ema_short[-2]
            last_ema_long = ema_long[-2]
            ema_short = ema_short[-1]
            ema_long = ema_long[-1]
            # conditions for alerts
            if ema_short > ema_long and last_ema_short < last_ema_long:
                message = each + "bullcoming " + str(SHORT_EMA) + " over " + str(LONG_EMA)
                print(each, "alert came")
                send_message(message)
                time.sleep(0.5)

# calling the function
if __name__ == "__main__":
    main()

The config part is all set up; the problem is the second part, getting the kline data. The error I keep getting looks like this:

data = socket.streamKline(symbol=symbol, interval=INTERVAL, limit=300)
NameError: name 'socket' is not defined

I just don't know how to do it. I want to build an EMA alert that can send me a message when I am not watching the chart, but this approach does not seem to work. I have tried many times and watched many videos, but I am just a beginner and not improving at all.
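The NameError comes from the second streamKline calling socket.streamKline on a name that is never defined. As a hedged sketch of one way to get the closing prices (not the asker's code: it uses Binance's public REST klines endpoint with the requests import already present, and get_closes is a hypothetical helper name):

import requests
import numpy as np

def get_closes(symbol, interval='1m', limit=300):
    # /api/v3/klines returns a list of klines; index 4 of each kline is the close price
    resp = requests.get('https://api.binance.com/api/v3/klines',
                        params={'symbol': symbol, 'interval': interval, 'limit': limit})
    resp.raise_for_status()
    return np.array([float(k[4]) for k in resp.json()])

In main(), data = get_closes(each, INTERVAL) could then stand in for data = streamKline(each).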
Using concurrent.futures within a for statement
I store QueryText within a pandas DataFrame. Once I've loaded all the queries, I want to run an analysis against each query. Currently I have ~50k to evaluate, so doing it one by one will take a long time. I therefore wanted to implement concurrent.futures. How do I take the individual QueryText stored within fullAnalysis, pass it to concurrent.futures, and return the output as a variable? Here is my entire code:

import pandas as pd
import time
import gensim
import sys
import warnings
from concurrent.futures import ThreadPoolExecutor
from concurrent.futures import as_completed

fullAnalysis = pd.DataFrame()

def fetch_data(jFile = 'ProcessingDetails.json'):
    print("Fetching data...please wait")
    #read JSON file for latest dictionary file name
    baselineDictionaryFileName = 'Dictionary/Dictionary_05-03-2020.json'
    #copy data to pandas dataframe
    labelled_data = pd.read_json(baselineDictionaryFileName)
    #Add two more columns to get the most similar text and score
    labelled_data['SimilarText'] = ''
    labelled_data['SimilarityScore'] = float()
    print("Data fetched from " + baselineDictionaryFileName + " and there are " + str(labelled_data.shape[0]) + " rows to be evaluated")
    return labelled_data

def calculateScore(inputFunc):
    warnings.filterwarnings("ignore", category=DeprecationWarning)
    model = gensim.models.Word2Vec.load('w2v_model_bigdata')
    inp = inputFunc
    print(inp)
    out = dict()
    strEvaluation = inp.split("most_similar ", 1)[1]
    #while inp != 'quit':
    split_inp = inp.split()
    try:
        if split_inp[0] == 'help':
            pass
        elif split_inp[0] == 'similarity' and len(split_inp) >= 3:
            pass
        elif split_inp[0] == 'most_similar' and len(split_inp) >= 2:
            for pair in model.most_similar(positive=[split_inp[1]]):
                out.update({pair[0]: pair[1]})
    except KeyError as ke:
        #print(str(ke) + "\n")
        inp = input()
    return out

def main():
    with ThreadPoolExecutor(max_workers=5) as executor:
        for i in range(len(fullAnalysis)):
            text = fullAnalysis['QueryText'][i]
            arg = 'most_similar' + ' ' + text
            #for item in executor.map(calculateScore, arg):
            output = executor.map(calculateScore, arg)
    return output

if __name__ == "__main__":
    fullAnalysis = fetch_data()
    results = main()
    print(f'results: {results}')
The Python Global Interpreter Lock, or GIL, allows only one thread to hold control of the Python interpreter. Since your function calculateScore is probably CPU-bound and requires the interpreter to execute its byte code, you may be gaining little by using threading. If, on the other hand, it were doing mostly I/O operations, it would be giving up the GIL for most of its running time, allowing other threads to run. But that does not seem to be the case here. You probably should be using the ProcessPoolExecutor from concurrent.futures (try it both ways and see):

def main():
    with ProcessPoolExecutor(max_workers=None) as executor:
        the_futures = {}
        for i in range(len(fullAnalysis)):
            text = fullAnalysis['QueryText'][i]
            arg = 'most_similar' + ' ' + text
            future = executor.submit(calculateScore, arg)
            the_futures[future] = i  # map future to request
        for future in as_completed(the_futures):  # results as they become available, not necessarily in order of submission
            i = the_futures[future]  # the original index
            result = future.result()  # the result

If you omit the max_workers parameter (or specify a value of None) in the ProcessPoolExecutor constructor, the default will be the number of processors on your machine (not a bad default). There is no point in specifying a value larger than the number of processors you have. If you do not need to tie a future back to its original request, then the_futures can just be a list to which you append each future. But the simplest approach is to not even bother with the as_completed method:

def main():
    with ProcessPoolExecutor(max_workers=5) as executor:
        the_futures = []
        for i in range(len(fullAnalysis)):
            text = fullAnalysis['QueryText'][i]
            arg = 'most_similar' + ' ' + text
            future = executor.submit(calculateScore, arg)
            the_futures.append(future)
        # wait for the completion of all the results and return them all:
        results = [f.result() for f in the_futures]  # results in creation order
        return results

It should be mentioned that the code launching the ProcessPoolExecutor functions should be in a block governed by if __name__ == '__main__':. If it isn't, you will get into a recursive loop with each subprocess launching the ProcessPoolExecutor. But that does seem to be the case here. Perhaps you meant to use the ProcessPoolExecutor all along?

Also: I don't know what the line

model = gensim.models.Word2Vec.load('w2v_model_bigdata')

in function calculateScore does. It may be the one I/O-bound statement. But it appears to be something that does not vary from call to call. If that is the case, and model is not being modified in the function, shouldn't this statement be moved out of the function and computed just once? Then the function would clearly run faster (and be clearly CPU-bound).

Also: the exception block

except KeyError as ke:
    #print(str(ke) + "\n")
    inp = input()

is puzzling. You are inputting a value that will never be used, right before returning. If this is meant to pause execution, there is no error message being output.
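As a rough sketch of that last suggestion (loading the model once rather than on every call): ProcessPoolExecutor accepts an initializer that runs once per worker process. This assumes Python 3.7+, the gensim 3.x most_similar API used in the question, and the model path from the question; the example words are made up:

from concurrent.futures import ProcessPoolExecutor
import gensim

model = None  # populated once in each worker process by the initializer

def load_model():
    global model
    model = gensim.models.Word2Vec.load('w2v_model_bigdata')

def calculateScore(word):
    # most_similar returns (word, similarity) pairs for the given word
    return dict(model.most_similar(positive=[word]))

def score_all(words):
    with ProcessPoolExecutor(initializer=load_model) as executor:
        futures = [executor.submit(calculateScore, w) for w in words]
        return [f.result() for f in futures]

if __name__ == '__main__':
    print(score_all(['computer', 'science']))  # hypothetical example words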
With Booboo's assistance, I was able to update the code to use ProcessPoolExecutor. Here is my updated code. Overall, processing has sped up by more than 60%. I did run into a processing issue and found this topic, BrokenPoolProcess, which addresses it.

output = {}
thePool = {}

def main(labelled_data, dictionaryRevised):
    args = sys.argv[1:]
    with ProcessPoolExecutor(max_workers=None) as executor:
        for i in range(len(labelled_data)):
            text = labelled_data['QueryText'][i]
            arg = 'most_similar' + ' ' + text
            output = winprocess.submit(executor, calculateScore, arg)
            thePool[output] = i  # original index for future to request
        for output in as_completed(thePool):  # results as they become available, not necessarily in order of submission
            i = thePool[output]  # the original index
            text = labelled_data['QueryText'][i]
            result = output.result()  # the result
            maximumKey = max(result.items(), key=operator.itemgetter(1))[0]
            maximumValue = result.get(maximumKey)
            labelled_data['SimilarText'][i] = maximumKey
            labelled_data['SimilarityScore'][i] = maximumValue
    return labelled_data, dictionaryRevised

if __name__ == "__main__":
    start = time.perf_counter()
    print("Starting to evaluate Query Text for labelling...")
    output_Labelled_Data, output_dictionary_revised = preProcessor()
    output, dictionary = main(output_Labelled_Data, output_dictionary_revised)
    finish = time.perf_counter()
    print(f'Finished in {round(finish-start, 2)} second(s)')
Python freezes during for loop
def get_price_history_data(ticker):
    pricelist = []
    try:
        pricedata = False
        tradingdays = 252
        Historical_Prices = pdr.get_data_yahoo(symbols=ticker, start=(datetime.today()-timedelta(tradingdays)), end=(datetime.today()))  #-timedelta(years4-1)))
        price_df = pd.DataFrame(Historical_Prices)
        pricelist = price_df['Adj Close']
        pricedata = True
    except:
        print(ticker, ' failed to get price data')
    return (pricelist, pricedata)

tickers = ['FB', 'V']
for ticker in tickers:
    [pricelist, pricedata] = get_price_history_data(ticker)

I have a list of a few thousand tickers that I run through this for loop. It outputs a single-column DataFrame and a boolean. Overall it works just fine and does what I need it to. However, it inconsistently freezes indefinitely with no error message and stops running, forcing me to close the program and re-run from the beginning. I am looking for a way to skip an iteration of the for loop if a certain amount of time has passed. I have looked into time.sleep() and continue but can't figure out how to apply them to this specific situation. If it freezes, it freezes on the pdr.get_data_yahoo() call. Help would be appreciated.
I'm guessing that get_data_yahoo() probably freezes because it's making some kind of request to a server that never gets answered. It doesn't have a timeout option, so the most obvious option is to start it in another thread/process and terminate it if it takes too long. You can use concurrent.futures for that. Once you're happy with how the code below works, you can replace sleeps_for_a_while with get_price_history_data and (3, 1, 4, 0) with tickers.

from concurrent.futures import ThreadPoolExecutor, TimeoutError
from time import sleep

TIMEOUT = 2  # seconds

def sleeps_for_a_while(sleep_for):
    print('starting {}s sleep'.format(sleep_for))
    sleep(sleep_for)
    print('finished {}s sleep'.format(sleep_for))
    # return a value to break out of the while loop
    return sleep_for * 100

if __name__ == '__main__':
    # this only works with functions that return values
    results = []
    for sleep_secs in (3, 1, 4, 0):
        with ThreadPoolExecutor(max_workers=1) as executor:
            # a future represents something that will be done
            future = executor.submit(sleeps_for_a_while, sleep_secs)
            try:
                # result() raises an error if it times out
                results.append(future.result(TIMEOUT))
            except TimeoutError as e:
                print('Function timed out')
                results.append(None)
    print('Got results:', results)
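As a hedged sketch of plugging get_price_history_data into that pattern (names from the question; the timeout and worker count are arbitrary choices). Note that the timeout only stops waiting for the result: a hung pdr.get_data_yahoo call keeps running in its worker thread, which is why the executor here is created once rather than in a with-block per iteration (exiting the with-block would wait for the hung call to finish):

from concurrent.futures import ThreadPoolExecutor, TimeoutError

TIMEOUT = 10  # seconds to wait per ticker
executor = ThreadPoolExecutor(max_workers=4)

results = {}
for ticker in tickers:
    future = executor.submit(get_price_history_data, ticker)
    try:
        results[ticker] = future.result(TIMEOUT)
    except TimeoutError:
        # give up on this ticker and move on; the worker thread is abandoned, not killed
        print(ticker, 'timed out, skipping')
        results[ticker] = ([], False)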
Running for loop in parallel via python
I have a process that loops over a list of IP addresses and returns some information about them. The simple for loop works great; my issue is running this at scale, given Python's Global Interpreter Lock (GIL). My goal is to have this function run in parallel and make full use of my 4 cores, so that running 100K of these won't take 24 hours via a normal for loop. After reading other answers on here, particularly this one, How do I parallelize a simple Python loop?, I decided to use joblib. When I run the 10 records below through it, it took over 10 minutes to run. This doesn't sound like it's working right. I know there is something I'm doing wrong or not understanding. Any help is greatly appreciated!

import pandas as pd
import numpy as np
import os as os
from ipwhois import IPWhois
from joblib import Parallel, delayed
import multiprocessing

num_core = multiprocessing.cpu_count()

iplookup = ['174.192.22.197',
            '70.197.71.201',
            '174.195.146.248',
            '70.197.15.130',
            '174.208.14.133',
            '174.238.132.139',
            '174.204.16.10',
            '104.132.11.82',
            '24.1.202.86',
            '216.4.58.18']

The normal for loop, which works fine:

asn = []
asnid = []
asncountry = []
asndesc = []
asnemail = []
asnaddress = []
asncity = []
asnstate = []
asnzip = []
asndesc2 = []
ipaddr = []

b = 1
totstolookup = len(iplookup)
for i in iplookup:
    i = str(i)
    print("Running #{} out of {}".format(b, totstolookup))
    try:
        obj = IPWhois(i, timeout=15)
        result = obj.lookup_whois()
        asn.append(result['asn'])
        asnid.append(result['asn_cidr'])
        asncountry.append(result['asn_country_code'])
        asndesc.append(result['asn_description'])
        try:
            asnemail.append(result['nets'][0]['emails'])
            asnaddress.append(result['nets'][0]['address'])
            asncity.append(result['nets'][0]['city'])
            asnstate.append(result['nets'][0]['state'])
            asnzip.append(result['nets'][0]['postal_code'])
            asndesc2.append(result['nets'][0]['description'])
            ipaddr.append(i)
        except:
            asnemail.append(0)
            asnaddress.append(0)
            asncity.append(0)
            asnstate.append(0)
            asnzip.append(0)
            asndesc2.append(0)
            ipaddr.append(i)
    except:
        pass
    b += 1

The function passed to joblib to run on all cores:

def run_ip_process(iplookuparray):
    asn = []
    asnid = []
    asncountry = []
    asndesc = []
    asnemail = []
    asnaddress = []
    asncity = []
    asnstate = []
    asnzip = []
    asndesc2 = []
    ipaddr = []
    b = 1
    totstolookup = len(iplookuparray)
    for i in iplookuparray:
        i = str(i)
        print("Running #{} out of {}".format(b, totstolookup))
        try:
            obj = IPWhois(i, timeout=15)
            result = obj.lookup_whois()
            asn.append(result['asn'])
            asnid.append(result['asn_cidr'])
            asncountry.append(result['asn_country_code'])
            asndesc.append(result['asn_description'])
            try:
                asnemail.append(result['nets'][0]['emails'])
                asnaddress.append(result['nets'][0]['address'])
                asncity.append(result['nets'][0]['city'])
                asnstate.append(result['nets'][0]['state'])
                asnzip.append(result['nets'][0]['postal_code'])
                asndesc2.append(result['nets'][0]['description'])
                ipaddr.append(i)
            except:
                asnemail.append(0)
                asnaddress.append(0)
                asncity.append(0)
                asnstate.append(0)
                asnzip.append(0)
                asndesc2.append(0)
                ipaddr.append(i)
        except:
            pass
        b += 1
    ipdataframe = pd.DataFrame({'ipaddress': ipaddr, 'asn': asn, 'asnid': asnid, 'asncountry': asncountry,
                                'asndesc': asndesc, 'emailcontact': asnemail, 'address': asnaddress,
                                'city': asncity, 'state': asnstate, 'zip': asnzip, 'ipdescrip': asndesc2})
    return ipdataframe

Running the process on all cores via joblib:

Parallel(n_jobs=num_core)(delayed(run_ip_process)(iplookuparray) for i in iplookup)
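One hedged observation on the joblib call above: the generator passes the entire iplookuparray to every delayed task, once per element of iplookup, so each of the ten tasks repeats the full ten-address lookup, which alone would explain a very long runtime. A minimal sketch of dispatching one IP per task instead (lookup_one is a hypothetical helper, not code from the post):

import pandas as pd
from ipwhois import IPWhois
from joblib import Parallel, delayed

def lookup_one(ip):
    # resolve a single IP and return one row as a dict, or None on failure
    try:
        result = IPWhois(str(ip), timeout=15).lookup_whois()
        net = (result.get('nets') or [{}])[0]
        return {'ipaddress': ip,
                'asn': result.get('asn'),
                'asnid': result.get('asn_cidr'),
                'asncountry': result.get('asn_country_code'),
                'asndesc': result.get('asn_description'),
                'emailcontact': net.get('emails'),
                'city': net.get('city')}
    except Exception:
        return None

rows = Parallel(n_jobs=-1)(delayed(lookup_one)(ip) for ip in iplookup)  # -1 = use all cores
ipdataframe = pd.DataFrame([r for r in rows if r is not None])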
How can I make this code more pythonic?
I am reading a bunch of daily files and using glob to concatenate them together into separate DataFrames. I eventually join them and basically create a single large file which I use to connect to a dashboard. I am not too familiar with Python, but I use pandas and sklearn often. As you can see, I am basically just reading the last 60 (or more) days' worth of data (the last 60 files) and creating a DataFrame for each. This works, but I am wondering if there is a more pythonic/better way? I watched a PyData video (about not being restricted by PEP 8 and making sure your code is pythonic), which was interesting. (FYI, the reason I need to read 60 days' worth of data is that customers can fill out a survey for a call which happened a long time ago. The customer fills out a survey today about a call that happened in July; I need to know about that call: how long it lasted, what the topic was, etc.)

os.chdir(r'C:\\Users\Documents\FTP\\')

loc = r'C:\\Users\Documents\\'
rosterloc = r'\\mand\\'
splitsname = r'Splits.csv'
fcrname = r'global_disp_'
npsname = r'survey_'
ahtname = r'callbycall_'
rostername = 'Daily_Roster.csv'
vasname = r'vas_report_'
ext = '.csv'

startdate = dt.date.today() - Timedelta('60 day')
enddate = dt.date.today()
daterange = Timestamp(enddate) - Timestamp(startdate)
daterange = (daterange / np.timedelta64(1, 'D')).astype(int)

data = []
frames = []
calls = []
bracket = []

try:
    for date_range in (Timestamp(startdate) + dt.timedelta(n) for n in range(daterange)):
        aht = pd.read_csv(ahtname + date_range.strftime('%Y_%m_%d') + ext)
        calls.append(aht)
except IOError:
    print('File does not exist:', ahtname + date_range.strftime('%Y_%m_%d') + ext)
aht = pd.concat(calls)
print('AHT Done')

try:
    for date_range in (Timestamp(startdate) + dt.timedelta(n) for n in range(daterange)):
        fcr = pd.read_csv(fcrname + date_range.strftime('%m_%d_%Y') + ext, parse_dates=['call_time'])
        data.append(fcr)
except IOError:
    print('File does not exist:', fcrname + date_range.strftime('%m_%d_%Y') + ext)
fcr = pd.concat(data)
print('FCR Done')

try:
    for date_range in (Timestamp(enddate) - dt.timedelta(n) for n in range(3)):
        nps = pd.read_csv(npsname + date_range.strftime('%m_%d_%Y') + ext, parse_dates=['call_date', 'date_completed'])
        frames.append(nps)
except IOError:
    print('File does not exist:', npsname + date_range.strftime('%m_%d_%Y') + ext)
nps = pd.concat(frames)
print('NPS Done')

try:
    for date_range in (Timestamp(startdate) + dt.timedelta(n) for n in range(daterange)):
        vas = pd.read_csv(vasname + date_range.strftime('%m_%d_%Y') + ext, parse_dates=['Call_date'])
        bracket.append(vas)
except IOError:
    print('File does not exist:', vasname + date_range.strftime('%m_%d_%Y') + ext)
vas = pd.concat(bracket)
print('VAS Done')

roster = pd.read_csv(loc + rostername)
print('Roster Done')

splits = pd.read_csv(loc + splitsname)
print('Splits Done')
I didn't change the names, but IMHO they should be more descriptive (e.g. pd == pandas? Not sure). Here is a more pythonic way to write it:

from functools import partial
import logging
from operator import add, sub
import os
import datetime as dt
import contextlib

import numpy as np
import pandas as pd
from pandas import Timestamp, Timedelta

os.chdir(r'C:\\Users\Documents\FTP\\')

location = r'C:\\Users\Documents\\'
roster_location = r'\\mand\\'
splits_name = r'Splits.csv'
fcr_name = r'global_disp_'
nps_name = r'survey_'
aht_name = r'callbycall_'
roster_name = 'Daily_Roster.csv'
vas_name = r'vas_report_'
ext = '.csv'

start_date = dt.date.today() - Timedelta('60 day')
end_date = dt.date.today()
daterange = Timestamp(end_date) - Timestamp(start_date)
daterange = (daterange / np.timedelta64(1, 'D')).astype(int)

# logger is better than "print" when you have multiple tiers to log; in this case: regular debug and exceptions
logger = logging.getLogger()

def timestamps_in_range(daterange, method=add):
    # injected operation method instead of an "if" statement for the subtracting case
    for n in range(daterange):
        yield method(Timestamp(start_date), dt.timedelta(n))  # use generators for creating series of data in place

def read_csv(name, date_range, **kwargs):
    # use functions/methods to shorten (make more readable) long, repetitive method invocations
    return pd.read_csv(name + date_range.strftime('%Y_%m_%d') + ext, **kwargs)

def log_done(module):
    # use functions/methods to shorten (make more readable) long, repetitive method invocations
    logger.debug("%s Done", module)

@contextlib.contextmanager  # contextmanager is great to separate business logic from exception handling
def mapper(function, iterable):
    try:
        yield map(function, iterable)  # map instead of executing the function in a "for" loop
    except IOError as err:
        logger.error('File does not exist: %s', err.filename)

# The following code is visually tight and cleaner.
# It shows only what's needed, hiding the most insignificant details and repetitive code.

# partial pre-fills a function (first argument) with arguments of this function (remaining arguments).
# In this case it is useful for feeding "map", which takes a one-argument function to execute on each element of a list.
read_csv_aht = partial(read_csv, aht_name)
# contextmanager beautifully hides the "dangerous" content, sharing only the "safe" result to be used
with mapper(read_csv_aht, timestamps_in_range(daterange)) as calls:
    aht = pd.concat(calls)
    log_done('AHT')

read_csv_fcr = partial(read_csv, fcr_name)
with mapper(read_csv_fcr, timestamps_in_range(daterange)) as data:
    fcr = pd.concat(data)
    log_done('FCR')

read_csv_nps = partial(read_csv, nps_name, parse_dates=['call_date', 'date_completed'])
with mapper(read_csv_nps, timestamps_in_range(3, sub)) as frames:
    nps = pd.concat(frames)
    log_done('NPS')

read_csv_vas = partial(read_csv, vas_name, parse_dates=['Call_date'])
with mapper(read_csv_vas, timestamps_in_range(daterange)) as bracket:
    vas = pd.concat(bracket)
    log_done('VAS')

roster = pd.read_csv(location + roster_name)
log_done('Roster')

splits = pd.read_csv(location + splits_name)
log_done('Splits')