I am working with asammdf to plot signals, and now I have a problem: how can I rename the channel groups with the signal names?
Here is my code:
mdf = MDF()
sigs = []
for equipment in table_list:
    print("Table name:", equipment[0])
    df = pd.read_sql_query('select * from ' + equipment[0], con)
    df = df.replace(np.nan, 0)
    if equipment[0] == 'state':
        df_time = (df['id'] - df.iloc[0]['id'])
    else:
        df_time = (df['ts'] - df.iloc[0]['ts']) * 1e-6
        df.pop('ts')
    sigs = []
    for signal in df.columns.to_list():
        test_signal = Signal(samples=df[signal], timestamps=df_time,
                             name=signal,
                             unit='')
        sigs.append(test_signal)
    mdf.append(sigs)
mdf.save('..\\Output\\test_complete.mf4', overwrite=True)
You should change the channel group comment:
mdf = MDF()
sigs = []
for equipment in table_list:
    print("Table name:", equipment[0])
    df = pd.read_sql_query('select * from ' + equipment[0], con)
    df = df.replace(np.nan, 0)
    if equipment[0] == 'state':
        df_time = (df['id'] - df.iloc[0]['id'])
    else:
        df_time = (df['ts'] - df.iloc[0]['ts']) * 1e-6
        df.pop('ts')
    sigs = []
    for signal in df.columns.to_list():
        test_signal = Signal(samples=df[signal], timestamps=df_time,
                             name=signal,
                             unit='')
        sigs.append(test_signal)
    mdf.append(sigs)
    channel_group = mdf.groups[-1].channel_group
    channel_group.comment = "fancy name"
mdf.save('..\\Output\\test_complete.mf4', overwrite=True)
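If the goal is to label each group after the table it came from, the same attribute can be set from the loop variable instead of a fixed string. A minimal sketch (assuming the table name in equipment[0] is the label you want), placed inside the for equipment loop right after mdf.append(sigs):
# label the group that was just appended with its source table name
mdf.groups[-1].channel_group.comment = equipment[0]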
Here is the code:
tickers = ['AMZN','AAPL','MSFT','DIS','GOOG']
# Created individual dataframes for each category of data and tickers
BS0=yfs.get_balance_sheet(tickers[0])
IS0=yfs.get_income_statement(tickers[0])
CF0=yfs.get_cash_flow(tickers[0])
BS0.columns = ['Period0','Period1','Period2','Period3']
IS0.columns = ['Period0','Period1','Period2','Period3']
CF0.columns = ['Period0','Period1','Period2','Period3']
BS0.columns.name = tickers[0]
IS0.columns.name = tickers[0]
CF0.columns.name = tickers[0]
BS1=yfs.get_balance_sheet(tickers[1])
IS1=yfs.get_income_statement(tickers[1])
CF1=yfs.get_cash_flow(tickers[1])
BS1.columns = ['Period0','Period1','Period2','Period3']
IS1.columns = ['Period0','Period1','Period2','Period3']
CF1.columns = ['Period0','Period1','Period2','Period3']
BS1.columns.name = tickers[1]
IS1.columns.name = tickers[1]
CF1.columns.name = tickers[1]
BS2=yfs.get_balance_sheet(tickers[2])
IS2=yfs.get_income_statement(tickers[2])
CF2=yfs.get_cash_flow(tickers[2])
BS2.columns = ['Period0','Period1','Period2','Period3']
IS2.columns = ['Period0','Period1','Period2','Period3']
CF2.columns = ['Period0','Period1','Period2','Period3']
BS2.columns.name = tickers[2]
IS2.columns.name = tickers[2]
CF2.columns.name = tickers[2]
BS3=yfs.get_balance_sheet(tickers[3])
IS3=yfs.get_income_statement(tickers[3])
CF3=yfs.get_cash_flow(tickers[3])
BS3.columns = ['Period0','Period1','Period2','Period3']
IS3.columns = ['Period0','Period1','Period2','Period3']
CF3.columns = ['Period0','Period1','Period2','Period3']
BS3.columns.name = tickers[3]
IS3.columns.name = tickers[3]
CF3.columns.name = tickers[3]
BS4=yfs.get_balance_sheet(tickers[4])
IS4=yfs.get_income_statement(tickers[4])
CF4=yfs.get_cash_flow(tickers[4])
BS4.columns = ['Period0','Period1','Period2','Period3']
IS4.columns = ['Period0','Period1','Period2','Period3']
CF4.columns = ['Period0','Period1','Period2','Period3']
BS4.columns.name = tickers[4]
IS4.columns.name = tickers[4]
CF4.columns.name = tickers[4]
I have tried writing this with "for ticker in tickers" logic and then converting the result to a dataframe with pandas, but that gives me one huge dataframe with all the information packed into individual cells instead of columns, and I have no idea how to spread it out in a way that makes sense for referencing.
Maybe there is a way to do that, or simply a loop that creates the separate dataframes like the code above but in fewer lines.
Thanks in advance
You can try maintaining a dictionary of dataframes:
import pandas as pd

tickers = ['AMZN', 'AAPL', 'MSFT', 'DIS', 'GOOG']
column_names = ['Period0', 'Period1', 'Period2', 'Period3']
ticker_dfs = {}
for index, ticker in enumerate(tickers):
    bs_index = 'BS' + str(index)
    is_index = 'IS' + str(index)
    cf_index = 'CF' + str(index)
    ticker_dfs[bs_index] = yfs.get_balance_sheet(ticker)
    ticker_dfs[bs_index].columns = column_names
    ticker_dfs[is_index] = yfs.get_income_statement(ticker)
    ticker_dfs[is_index].columns = column_names
    ticker_dfs[cf_index] = yfs.get_cash_flow(ticker)
    ticker_dfs[cf_index].columns = column_names
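Afterwards you access each statement by its key, e.g. ticker_dfs['BS0'] for the first ticker's balance sheet (a short usage sketch based on the loop above):
# look up one dataframe by key
print(ticker_dfs['BS0'])
# or iterate over everything that was collected
for name, frame in ticker_dfs.items():
    print(name, frame.shape)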
I am trying to run the following Python script on Pythonanywhere.com. The script gets stuck at the await call in the main function (see the while loop). The code works perfectly, without errors, when I run it on my computer.
Code snippet
async def main(coin, minQty, minNotional, histdata, open_position=False, cross_below=True, gotbalance=False):
    bm = BinanceSocketManager(client)
    ts = bm.trade_socket(coin)
    last_high = 0
    async with ts as tscm:
        while True:
            res = await tscm.recv()
            if res:
                frame = createframe(res)
                if last_high < frame.Price[0]:
                    last_high = frame.Price[0]
Full Code
import nest_asyncio
import asyncio
import pandas as pd
from binance import BinanceSocketManager
from binance.client import Client
from keys import api_key, api_secret
import numpy as np
from datetime import datetime
import winsound
import urllib.request

nest_asyncio.apply()

client = Client(api_key, api_secret)

ST = 7
LT = 25
coin = 'BTCUSDT'


def gethistoricals(symbol, LT):
    df = pd.DataFrame(client.get_historical_klines(symbol, '5m', str((LT - 1) * 5) + 'mins ago UTC'))
    df = df.iloc[:, :6]
    df.columns = ['Timestamp', 'Open', 'High', 'Low', 'Close', 'Volume']
    df.Open = df.Open.astype('float')
    df.High = df.High.astype('float')
    df.Low = df.Low.astype('float')
    df.Close = df.Close.astype('float')
    df.Volume = df.Volume.astype('float')
    df['ST'] = df.Close.rolling(ST - 1).mean()
    df['LT'] = df.Close.rolling(LT - 1).mean()
    high_low = df['High'] - df['Low']
    high_close = np.abs(df['High'] - df['Close'].shift())
    low_close = np.abs(df['Low'] - df['Close'].shift())
    ranges = pd.concat([high_low, high_close, low_close], axis=1)
    true_range = np.max(ranges, axis=1)
    df['ATR'] = true_range.rolling(14).sum() / 14
    df['Datetime'] = df['Timestamp'].iloc[-1]
    df.Datetime = pd.to_datetime(df.Datetime, unit='ms')
    df.dropna(inplace=True)
    # print(df)
    return df


def liveSMA(hist, live):
    liveST = (hist['ST'].values + live.Price.values) / ST
    liveLT = (hist['LT'].values + live.Price.values) / LT
    return liveST, liveLT


def createframe(msg):
    df = pd.DataFrame([msg])
    df = df.loc[:, ['E', 's', 'E', 'p']]
    df.columns = ['Timestamp', 'symbol', 'Time', 'Price']
    df.Price = df.Price.astype(float)
    df.Time = pd.to_datetime(df.Time, unit='ms')
    return df


pnl = []


async def main(coin, minQty, minNotional, histdata, open_position=False, cross_below=True, gotbalance=False):
    bm = BinanceSocketManager(client)
    ts = bm.trade_socket(coin)
    last_high = 0
    async with ts as tscm:
        while True:
            res = await tscm.recv()
            if res:
                frame = createframe(res)
                if last_high < frame.Price[0]:
                    last_high = frame.Price[0]
                last_hist_time = datetime.strptime(
                    str(np.datetime_as_string(histdata['Datetime'].values[0], unit='ms')), '%Y-%m-%dT%H:%M:%S.%f')
                if int((frame.Time[0] - last_hist_time).total_seconds()) >= 305:
                    histdata = gethistoricals(coin, LT)
                livest, livelt, atr = histdata['ST'].values, histdata['LT'].values, histdata['ATR'].values
                try:
                    if not open_position:
                        if not gotbalance:
                            balance = client.get_asset_balance(asset='USDT')
                            balance = float(balance['free']) * 0.95
                            gotbalance = True
                        if balance >= minNotional:
                            qty = round(balance / frame.Price[0], 5)
                            print(qty)
                            print(livest[0] > livelt[0], cross_below, qty >= minQty)
                        else:
                            print('Minimum Notional Error')
                        if livest[0] > livelt[0] and cross_below and qty >= minQty:
                            order = client.create_order(symbol=coin,
                                                        side='BUY',
                                                        type='MARKET',
                                                        quantity=qty)
                            print(order)
                            buyprice = float(order['fills'][0]['price'])
                            winsound.Beep(1000, 100)
                            open_position = True
                            cross_below = False
                    if open_position:
                        if frame.Price[0] < (last_high - (2 * atr[0])) or livest[0] < livelt[0]:
                            print(f'SL Trigger: {frame.Price[0] < (last_high - (2 * atr[0]))}')
                            print(f'Cross Trigger: {livest[0] < livelt[0]}')
                            order = client.create_order(symbol=coin,
                                                        side='SELL',
                                                        type='MARKET',
                                                        quantity=qty)
                            print(order)
                            sellprice = float(order['fills'][0]['price'])
                            winsound.Beep(1000, 100)
                            open_position = False
                            print(f'PnL : {(buyprice - sellprice) / buyprice}')
                            pnl.append((buyprice - sellprice) / buyprice)
                            print(f'Mean PnL : {np.mean(pnl)}')
                    if livest[0] < livelt[0]:
                        cross_below = True
                except Exception as e:
                    print(e)


if __name__ == "__main__":
    winsound.Beep(1000, 100)
    info = client.get_symbol_info(coin)
    # print(info)
    minQty = float(info['filters'][2]['minQty'])
    minNotional = float(info['filters'][3]['minNotional'])
    historicals = gethistoricals(coin, LT)
    loop = asyncio.get_event_loop()
    loop.run_until_complete(main(coin, minQty, minNotional, historicals))
Thanks
I am working on some python code to predict Default rate of loans handed out by a bank.
I have calculated the WOE and information value (IV) on the training set
(using the following code: https://github.com/Sundar0989/WOE-and-IV/blob/master/WOE_IV.ipynb?fbclid=IwAR1MvEfyGsdyTre0uPJC5WRl91dfue_t0vH5qJezwm2mAg6sjHZJg9MyDYo).
We have also identified 2 high-cardinality variables. However, we don't know how to add these WOE scores to the whole set. How do we tackle this problem? How can we go further and use WOE to predict the target variable?
code:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy, pylab
Read the data received from the bank, do feature selection part 1, and split the whole set (Training) into a training set (indices_traintrain), a validation set (indices_val) and a test set (indices_test): a 70/30 split between the training+validation sets and the test set, then a 70/30 split between training and validation.
Training = pd.read_excel('/Users/enjo/Documents/Master/DM/Data_DSC2019_STUDENTS/DSC2019_Training.xlsx', na_values=np.nan)
Status = Training.iloc[:,-1]
Data = Training.iloc[:,0:45]
Data_missing = Data.isna()
Data_missing = Data_missing.sum()
print(Data_missing/len(Data))
"""
drop variables with more than 80% missing
"""
Drop = ['FREE_CASH_FLOW_AMT',
'A2_MTHS_FIRST_PCX_COREPROF_CNT', 'A2_MONTHS_IN_BELGIUM_CNT', 'A2_MTHS_SNC_FIRST_COREPROF_CNT', 'MONTHS_SINCE_LAST_REFUSAL_CNT']
DroppedTraining = Training.copy()
for element in Drop:
    DroppedTraining.drop(element, axis=1, inplace=True)
import numpy as np
from sklearn import datasets
from sklearn import svm
from sklearn import preprocessing
Data_preprocessed=[] #contains preprocessed data
from Preprocessing_continuous import Preprocessing_continuous #import function for preprocessing
from Preprocessing_discrete import Preprocessing_discrete #import function for preprocessing
from sklearn.model_selection import train_test_split
indices=np.arange(26962)
indices_train, indices_test = train_test_split(indices, test_size=0.3, random_state=0)
indices_traintrain, indices_val = train_test_split(indices_train, test_size=0.3, random_state=0)
Training['target']= Training['Label_Default'].apply(lambda x:1 if x=='Y' else 0)
Highcardinalityset=[]
Highcardinalityset = Training[['Type',
'INDUSTRY_CD_3',
'INDUSTRY_CD_4',
'Managing_Sales_Office_Nbr',
'Postal_Code_L',
'Product_Desc',
'CREDIT_TYPE_CD',
'ACCOUNT_PURPOSE_CD',
'A2_MARITAL_STATUS_CD',
'FINANCIAL_PRODUCT_TYPE_CD',
'A2_EMPLOYMENT_STATUS_CD',
'A2_RESIDENT_STATUS_CD',
'target']]
Highcardinalityset = Highcardinalityset.iloc[indices_traintrain]
Function found on GitHub:
import pandas as pd
import numpy as np
import pandas.core.algorithms as algos
from pandas import Series
import scipy.stats.stats as stats
import re
import traceback
import string
max_bin = 20
force_bin = 3
# define a binning function
def mono_bin(Y, X, n=max_bin):
    df1 = pd.DataFrame({"X": X, "Y": Y})
    justmiss = df1[['X', 'Y']][df1.X.isnull()]
    notmiss = df1[['X', 'Y']][df1.X.notnull()]
    r = 0
    while np.abs(r) < 1:
        try:
            d1 = pd.DataFrame({"X": notmiss.X, "Y": notmiss.Y, "Bucket": pd.qcut(notmiss.X, n)})
            d2 = d1.groupby('Bucket', as_index=True)
            r, p = stats.spearmanr(d2.mean().X, d2.mean().Y)
            n = n - 1
        except Exception as e:
            n = n - 1

    if len(d2) == 1:
        n = force_bin
        bins = algos.quantile(notmiss.X, np.linspace(0, 1, n))
        if len(np.unique(bins)) == 2:
            bins = np.insert(bins, 0, 1)
            bins[1] = bins[1] - (bins[1] / 2)
        d1 = pd.DataFrame({"X": notmiss.X, "Y": notmiss.Y, "Bucket": pd.cut(notmiss.X, np.unique(bins), include_lowest=True)})
        d2 = d1.groupby('Bucket', as_index=True)

    d3 = pd.DataFrame({}, index=[])
    d3["MIN_VALUE"] = d2.min().X
    d3["MAX_VALUE"] = d2.max().X
    d3["COUNT"] = d2.count().Y
    d3["EVENT"] = d2.sum().Y
    d3["NONEVENT"] = d2.count().Y - d2.sum().Y
    d3 = d3.reset_index(drop=True)

    if len(justmiss.index) > 0:
        d4 = pd.DataFrame({'MIN_VALUE': np.nan}, index=[0])
        d4["MAX_VALUE"] = np.nan
        d4["COUNT"] = justmiss.count().Y
        d4["EVENT"] = justmiss.sum().Y
        d4["NONEVENT"] = justmiss.count().Y - justmiss.sum().Y
        d3 = d3.append(d4, ignore_index=True)

    d3["EVENT_RATE"] = d3.EVENT/d3.COUNT
    d3["NON_EVENT_RATE"] = d3.NONEVENT/d3.COUNT
    d3["DIST_EVENT"] = d3.EVENT/d3.sum().EVENT
    d3["DIST_NON_EVENT"] = d3.NONEVENT/d3.sum().NONEVENT
    d3["WOE"] = np.log(d3.DIST_EVENT/d3.DIST_NON_EVENT)
    d3["IV"] = (d3.DIST_EVENT-d3.DIST_NON_EVENT)*np.log(d3.DIST_EVENT/d3.DIST_NON_EVENT)
    d3["VAR_NAME"] = "VAR"
    d3 = d3[['VAR_NAME','MIN_VALUE', 'MAX_VALUE', 'COUNT', 'EVENT', 'EVENT_RATE', 'NONEVENT', 'NON_EVENT_RATE', 'DIST_EVENT','DIST_NON_EVENT','WOE', 'IV']]
    d3 = d3.replace([np.inf, -np.inf], 0)
    d3.IV = d3.IV.sum()
    return d3
def char_bin(Y, X):
    df1 = pd.DataFrame({"X": X, "Y": Y})
    justmiss = df1[['X', 'Y']][df1.X.isnull()]
    notmiss = df1[['X', 'Y']][df1.X.notnull()]
    df2 = notmiss.groupby('X', as_index=True)

    d3 = pd.DataFrame({}, index=[])
    d3["COUNT"] = df2.count().Y
    d3["MIN_VALUE"] = df2.sum().Y.index
    d3["MAX_VALUE"] = d3["MIN_VALUE"]
    d3["EVENT"] = df2.sum().Y
    d3["NONEVENT"] = df2.count().Y - df2.sum().Y

    if len(justmiss.index) > 0:
        d4 = pd.DataFrame({'MIN_VALUE': np.nan}, index=[0])
        d4["MAX_VALUE"] = np.nan
        d4["COUNT"] = justmiss.count().Y
        d4["EVENT"] = justmiss.sum().Y
        d4["NONEVENT"] = justmiss.count().Y - justmiss.sum().Y
        d3 = d3.append(d4, ignore_index=True)

    d3["EVENT_RATE"] = d3.EVENT/d3.COUNT
    d3["NON_EVENT_RATE"] = d3.NONEVENT/d3.COUNT
    d3["DIST_EVENT"] = d3.EVENT/d3.sum().EVENT
    d3["DIST_NON_EVENT"] = d3.NONEVENT/d3.sum().NONEVENT
    d3["WOE"] = np.log(d3.DIST_EVENT/d3.DIST_NON_EVENT)
    d3["IV"] = (d3.DIST_EVENT-d3.DIST_NON_EVENT)*np.log(d3.DIST_EVENT/d3.DIST_NON_EVENT)
    d3["VAR_NAME"] = "VAR"
    d3 = d3[['VAR_NAME','MIN_VALUE', 'MAX_VALUE', 'COUNT', 'EVENT', 'EVENT_RATE', 'NONEVENT', 'NON_EVENT_RATE', 'DIST_EVENT','DIST_NON_EVENT','WOE', 'IV']]
    d3 = d3.replace([np.inf, -np.inf], 0)
    d3.IV = d3.IV.sum()
    d3 = d3.reset_index(drop=True)
    return d3
def data_vars(df1, target):
    stack = traceback.extract_stack()
    filename, lineno, function_name, code = stack[-2]
    vars_name = re.compile(r'\((.*?)\).*$').search(code).groups()[0]
    final = (re.findall(r"[\w']+", vars_name))[-1]

    x = df1.dtypes.index
    count = -1

    for i in x:
        if i.upper() not in (final.upper()):
            if np.issubdtype(df1[i], np.number) and len(Series.unique(df1[i])) > 2:
                conv = mono_bin(target, df1[i])
                conv["VAR_NAME"] = i
                count = count + 1
            else:
                conv = char_bin(target, df1[i])
                conv["VAR_NAME"] = i
                count = count + 1

            if count == 0:
                iv_df = conv
            else:
                iv_df = iv_df.append(conv, ignore_index=True)

    iv = pd.DataFrame({'IV': iv_df.groupby('VAR_NAME').IV.max()})
    iv = iv.reset_index()
    return (iv_df, iv)
final_iv, IV = data_vars(Highcardinalityset,Highcardinalityset.target)
final_iv
IV.sort_values('IV')
IV.to_csv('test.csv')
transform_vars_list = Highcardinalityset.columns.difference(['target'])
transform_prefix = 'new_' # leave this value blank if you need replace the original column values
transform_vars_list
for var in transform_vars_list:
    small_df = final_iv[final_iv['VAR_NAME'] == var]
    transform_dict = dict(zip(small_df.MAX_VALUE.astype(str), small_df.WOE.astype(str)))
    replace_cmd = ''
    replace_cmd1 = ''
    for i in sorted(transform_dict.items()):
        replace_cmd = replace_cmd + str(i[1]) + str(' if x <= ') + str(i[0]) + ' else '
        replace_cmd1 = replace_cmd1 + str(i[1]) + str(' if x == "') + str(i[0]) + '" else '
    replace_cmd = replace_cmd + '0'
    replace_cmd1 = replace_cmd1 + '0'
    if replace_cmd != '0':
        try:
            Highcardinalityset[transform_prefix + var] = Highcardinalityset[var].apply(lambda x: eval(replace_cmd))
        except:
            Highcardinalityset[transform_prefix + var] = Highcardinalityset[var].apply(lambda x: eval(replace_cmd1))
Highcardinalityset['Postal_Code_L'].value_counts()
Highcardinalityset['new_Postal_Code_L'].value_counts()
Highcardinalityset['Managing_Sales_Office_Nbr'].value_counts()
Highcardinalityset['new_Managing_Sales_Office_Nbr'].value_counts()
It is nice to see where the WOE is high: that postal code is interesting, since it indicates a high risk of default!
Highcardinalityset.to_excel("Highcardinalitysettraintrain.xlsx")
TrainingWOE = DroppedTraining[['Managing_Sales_Office_Nbr', "Postal_Code_L"]]
TrainingWOE["Postal_Code_L_WOE"]=Highcardinalityset[["new_Postal_Code_L"]]
TrainingWOE["Managing_Sales_Office_Nbr_WOE"]=Highcardinalityset[["new_Managing_Sales_Office_Nbr"]]
Drop variables that are not relevant because of their low IV value:
Drop = ["ACCOUNT_PURPOSE_CD", "A2_MARITAL_STATUS_CD", "A2_EMPLOYMENT_STATUS_CD", "A2_RESIDENT_STATUS_CD",
"INDUSTRY_CD_3", "INDUSTRY_CD_4","Type"]
DroppedTrainingAfterIVcalc = DroppedTraining.copy()
for element in Drop:
    DroppedTrainingAfterIVcalc.drop(element, axis=1, inplace=True)
Preprocess the remaining variables (44 - 5 (too many missing) - 7 (low IV) + 1 (target variable added)).
Thanks for asking this question. Here is the code to do the required transformation, which is shown in the notebook as well:
transform_vars_list = df.columns.difference(['target'])
transform_prefix = 'new_'  # leave this value blank to replace the original column

# apply transformations
for var in transform_vars_list:
    small_df = final_iv[final_iv['VAR_NAME'] == var]
    transform_dict = dict(zip(small_df.MAX_VALUE, small_df.WOE))
    replace_cmd = ''
    replace_cmd1 = ''
    for i in sorted(transform_dict.items()):
        replace_cmd = replace_cmd + str(i[1]) + str(' if x <= ') + str(i[0]) + ' else '
        replace_cmd1 = replace_cmd1 + str(i[1]) + str(' if x == "') + str(i[0]) + '" else '
    replace_cmd = replace_cmd + '0'
    replace_cmd1 = replace_cmd1 + '0'
    if replace_cmd != '0':
        try:
            df[transform_prefix + var] = df[var].apply(lambda x: eval(replace_cmd))
        except:
            df[transform_prefix + var] = df[var].apply(lambda x: eval(replace_cmd1))
In addition, there is a package Xverse which does the same. Please refer to it here - https://github.com/Sundar0989/XuniVerse
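If you prefer to avoid building eval strings for the categorical (high-cardinality) variables, the same WOE substitution can be written as a plain dictionary lookup. A sketch, assuming final_iv is the table returned by data_vars and var is one of your categorical columns:
# build a {category value -> WOE} mapping for one variable
small_df = final_iv[final_iv['VAR_NAME'] == var]
woe_map = dict(zip(small_df['MAX_VALUE'], small_df['WOE']))
# apply it to any set (train, validation or test); unseen categories fall back to 0
df['new_' + var] = df[var].map(woe_map).fillna(0)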
I have an interesting problem. I have two files: NYPD_Motor_Collisions.csv has 1.2M lines and weatherfinal.txt has 109K lines. The objective is to merge the temperature and precipitation data from weatherfinal.txt into the collisions file as two columns, matched on latitude and longitude. I wrote the following code using pandas dataframes in Python.
from math import cos, asin, sqrt
import pandas as pd
import numpy as np
import os
import re
import datetime


def distance(lat1, lon1, lat2, lon2):
    p = 0.017453292519943295
    a = 0.5 - cos((lat2-lat1)*p)/2 + cos(lat1*p)*cos(lat2*p) * (1-cos((lon2-lon1)*p)) / 2
    return 12742 * asin(sqrt(a))


def closest(data, v):
    return min(data, key=lambda p: distance(v['lat'], v['lon'], p['lat'], p['lon']))


tempDataList = []
#v = {'lat': 39.7622290, 'lon': -86.1519750}
#print(closest(tempDataList, v))

print os.getcwd()

filed_ = open("weatherfinal.txt", 'r')
fileo_ = open("weatherfinal_updated.txt", "w")
lines_ = filed_.readlines()
for line_ in lines_:
    outline = re.sub(" +", " ", line_)
    fileo_.write(outline + "\n")
fileo_.close()

df = pd.read_csv("NYPD_Motor_Vehicle_Collisions.csv")
colhead = np.append(df.columns.values, ['TEMP', 'PREP'])
outdf = pd.DataFrame(columns=colhead)

df2 = pd.read_csv("weatherfinal_updated.txt", ' ')
df2.set_index(['WBANNO', 'LST_DATE', 'LST_TIME'])
sensorIds = df2['WBANNO'].unique()

for ids_ in sensorIds:
    longitude = df2.loc[df2['WBANNO'] == ids_, 'LONGITUDE'].iloc[0]
    latitude = df2.loc[df2['WBANNO'] == ids_, 'LATITUDE'].iloc[0]
    tempDataList.append({'lat': latitude, 'lon': longitude, 'SENSORID': ids_})

print tempDataList

for index, row in df.iterrows():
    lon_ = row['LONGITUDE']
    lat_ = row['LATITUDE']
    tdate = row['DATE']
    ttime = row['TIME']
    tcal = 5
    pcal = 0
    fwdate = datetime.datetime.strptime(str(tdate), '%m/%d/%Y').strftime('%Y%m%d')
    fwtime = datetime.datetime.strptime(str(ttime), '%H:%M').strftime('%H%M')
    ntime = float(fwtime) + float(100)
    closests_ = closest(tempDataList, {'lat': lat_, 'lon': lon_})
    sensorid = closests_['SENSORID']
    usedSensorId = sensorid
    selectedWeatherRow = df2.loc[(df2.WBANNO == sensorid) & (df2.LST_DATE == float(fwdate)) & (df2.LST_TIME >= float(fwtime)) & (df2.LST_TIME < ntime), ['T_CALC', 'P_CALC']]
    if len(selectedWeatherRow.index) == 0:
        for sensId in sensorIds:
            if sensId == sensorid:
                continue
            selectedWeatherRow = df2.loc[(df2.WBANNO == sensId) & (df2.LST_DATE == float(fwdate)) & (df2.LST_TIME >= float(fwtime)) & (df2.LST_TIME < ntime), ['T_CALC', 'P_CALC']]
            if len(selectedWeatherRow.index) == 0:
                continue
            else:
                tcal = selectedWeatherRow['T_CALC'].values[0]
                pcal = selectedWeatherRow['P_CALC'].values[0]
                usedSensorId = sensId
                break
    else:
        tcal = selectedWeatherRow['T_CALC'].values[0]
        pcal = selectedWeatherRow['P_CALC'].values[0]
    row['TEMP'] = tcal
    row['PREP'] = pcal
    outdf.loc[index] = row
    print index, tcal, pcal, fwdate, fwtime, ntime, usedSensorId

print "Loop completed"
outdf.to_csv("NYPD_TRAFFIC_DATA.csv")
print "file completed"
This program has been running for days. I am not sure why the dataframe version is so slow. I rewrote the program without dataframes, using dictionaries, and it completed in a few minutes. I am not sure whether dataframes are slow or I am just not using them correctly; posting here for learning.
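For reference, most of the time probably goes into re-filtering the full df2 inside the loop over 1.2M collision rows; pre-grouping the weather rows once, keyed by station and date, is roughly what the dictionary rewrite achieves. A minimal sketch, assuming the same column names as above:
# group the weather rows once so each collision row only needs a dict lookup
weather_lookup = {key: grp for key, grp in df2.groupby(['WBANNO', 'LST_DATE'])}
# inside the collision loop, replace the repeated boolean filtering with:
grp = weather_lookup.get((sensorid, float(fwdate)))
if grp is not None:
    selectedWeatherRow = grp.loc[(grp.LST_TIME >= float(fwtime)) & (grp.LST_TIME < ntime), ['T_CALC', 'P_CALC']]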
I have a dataframe that consists of hourly data for a whole year. I want to calculate the monthly means and show them in a time series plot. I have one variable, which is the NO2 value.
#Cleaning data
ck_2000 = pd.read_csv('2000-CamdenKerbside.csv', header=0,skiprows=4,usecols=range(0,3),skipfooter = 1, na_values = 'No data',engine = 'python')
colnames = ['Date', 'Time', 'NO2']
ck_2000.columns = colnames
#Reformat date/time
ck_2000.Time.replace(to_replace = '24:00:00', value = '00:00:00', inplace = True)
dtw = pd.to_datetime(ck_2000.Date + ck_2000.Time,format='%d/%m/%Y%H:%M:%S')
ck_2000.index = dtw
#Index dataframe by date
firstDate = ck_2000.index[0]
lastDate = ck_2000.index[len(ck_2000.Date) - 1]
ck2000 = ck_2000.reindex(index=pd.date_range(start = firstDate, end =lastDate, freq = '1H'), fill_value= None)
#Change data type to float
ck2000['NO2'] = ck2000['NO2'].dropna().astype('int64')
#Interpolation
ck_2000_int = ck_2000.interpolate()
#df's for all months
ck_2000_jan = ck_2000_int['2000-01']
ck_2000_feb = ck_2000_int['2000-02']
ck_2000_mar = ck_2000_int['2000-03']
ck_2000_apr = ck_2000_int['2000-04']
ck_2000_may = ck_2000_int['2000-05']
ck_2000_jun = ck_2000_int['2000-06']
ck_2000_jul = ck_2000_int['2000-07']
ck_2000_aug = ck_2000_int['2000-08']
ck_2000_sept = ck_2000_int['2000-09']
ck_2000_oct = ck_2000_int['2000-10']
ck_2000_nov = ck_2000_int['2000-11']
ck_2000_dec = ck_2000_int['2000-12']
You should be able to use resample.
Consider the following example:
tidx = pd.date_range('2000-01-01', '2000-12-31 23:00', freq='H')
ck_2000_int = pd.DataFrame(dict(NO2=np.random.randn(len(tidx))), tidx)
ck_2000_int.resample('M').mean().plot()
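Applied to your data, assuming ck_2000_int still carries the hourly DatetimeIndex built above, the monthly-mean plot is a one-liner:
ck_2000_int['NO2'].resample('M').mean().plot()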