Is python pandas dataframe too slow?

I have an interesting problem. I have two files: NYPD_Motor_Vehicle_Collisions.csv has 1.2M lines and weatherfinal.txt has 109K lines. The objective is to merge the temperature and precipitation data from weatherfinal.txt into the collisions file as two new columns, matched on latitude and longitude. I wrote the following code using pandas DataFrames in Python.
from math import cos, asin, sqrt
import pandas as pd
import numpy as np
import os
import re
import datetime

def distance(lat1, lon1, lat2, lon2):
    p = 0.017453292519943295
    a = 0.5 - cos((lat2-lat1)*p)/2 + cos(lat1*p)*cos(lat2*p) * (1-cos((lon2-lon1)*p)) / 2
    return 12742 * asin(sqrt(a))

def closest(data, v):
    return min(data, key=lambda p: distance(v['lat'], v['lon'], p['lat'], p['lon']))

tempDataList = []
#v = {'lat': 39.7622290, 'lon': -86.1519750}
#print(closest(tempDataList, v))
print os.getcwd()
filed_ = open("weatherfinal.txt", 'r')
fileo_ = open("weatherfinal_updated.txt", "w")
lines_ = filed_.readlines()
for line_ in lines_:
    outline = re.sub(" +", " ", line_)
    fileo_.write(outline + "\n")
fileo_.close()
df = pd.read_csv("NYPD_Motor_Vehicle_Collisions.csv")
colhead = np.append(df.columns.values, ['TEMP', 'PREP'])
outdf = pd.DataFrame(columns=colhead)
df2 = pd.read_csv("weatherfinal_updated.txt", ' ')
df2.set_index(['WBANNO', 'LST_DATE', 'LST_TIME'])
sensorIds = df2['WBANNO'].unique()
for ids_ in sensorIds:
    longitude = df2.loc[df2['WBANNO'] == ids_, 'LONGITUDE'].iloc[0]
    latitude = df2.loc[df2['WBANNO'] == ids_, 'LATITUDE'].iloc[0]
    tempDataList.append({'lat': latitude, 'lon': longitude, 'SENSORID': ids_})
print tempDataList
for index, row in df.iterrows():
    lon_ = row['LONGITUDE']
    lat_ = row['LATITUDE']
    tdate = row['DATE']
    ttime = row['TIME']
    tcal = 5
    pcal = 0
    fwdate = datetime.datetime.strptime(str(tdate), '%m/%d/%Y').strftime('%Y%m%d')
    fwtime = datetime.datetime.strptime(str(ttime), '%H:%M').strftime('%H%M')
    ntime = float(fwtime) + float(100)
    closests_ = closest(tempDataList, {'lat': lat_, 'lon': lon_})
    sensorid = closests_['SENSORID']
    usedSensorId = sensorid
    selectedWeatherRow = df2.loc[(df2.WBANNO == sensorid) & (df2.LST_DATE == float(fwdate)) & (df2.LST_TIME >= float(fwtime)) & (df2.LST_TIME < ntime), ['T_CALC', 'P_CALC']]
    if len(selectedWeatherRow.index) == 0:
        for sensId in sensorIds:
            if sensId == sensorid:
                continue
            selectedWeatherRow = df2.loc[(df2.WBANNO == sensId) & (df2.LST_DATE == float(fwdate)) & (df2.LST_TIME >= float(fwtime)) & (df2.LST_TIME < ntime), ['T_CALC', 'P_CALC']]
            if len(selectedWeatherRow.index) == 0:
                continue
            else:
                tcal = selectedWeatherRow['T_CALC'].values[0]
                pcal = selectedWeatherRow['P_CALC'].values[0]
                usedSensorId = sensId
                break
    else:
        tcal = selectedWeatherRow['T_CALC'].values[0]
        pcal = selectedWeatherRow['P_CALC'].values[0]
    row['TEMP'] = tcal
    row['PREP'] = pcal
    outdf.loc[index] = row
    print index, tcal, pcal, fwdate, fwtime, ntime, usedSensorId
print "Loop completed"
outdf.to_csv("NYPD_TRAFFIC_DATA.csv")
print "file completed"
This program has been running for days. I'm not sure why the DataFrame approach is so slow. I rewrote the program without DataFrames, using dictionaries, and it completed in a few minutes. I'm not sure whether DataFrames are slow or I'm just not using them correctly. Just posting here for learning.
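For reference, here is a rough sketch (not the original program) of how the same join could be done without iterrows(): pick the nearest station for every collision at once, then merge on station, date and hour. It assumes the column names used in the code above and one weather reading per station per hour; the exact matching rules and memory use would need checking against the real files.

import numpy as np
import pandas as pd

# One row per weather station: its id and coordinates.
sensors = df2.drop_duplicates('WBANNO')[['WBANNO', 'LATITUDE', 'LONGITUDE']].reset_index(drop=True)

# Nearest station for every collision, computed station-by-station so memory stays O(rows);
# the formula mirrors distance() above (argmin is unaffected by dropping the asin/sqrt).
p = np.pi / 180.0
lat = df['LATITUDE'].to_numpy() * p
lon = df['LONGITUDE'].to_numpy() * p
best = np.full(len(df), np.inf)
nearest = np.zeros(len(df), dtype=int)
for k, (slat, slon) in enumerate(zip(sensors['LATITUDE'] * p, sensors['LONGITUDE'] * p)):
    a = 0.5 - np.cos(slat - lat) / 2 + np.cos(lat) * np.cos(slat) * (1 - np.cos(slon - lon)) / 2
    closer = a < best
    best[closer] = a[closer]
    nearest[closer] = k
df['WBANNO'] = sensors['WBANNO'].to_numpy()[nearest]

# Join on station, date and hour instead of filtering df2 once per collision row.
df['LST_DATE'] = pd.to_datetime(df['DATE'], format='%m/%d/%Y').dt.strftime('%Y%m%d').astype(float)
df['HOUR'] = pd.to_datetime(df['TIME'], format='%H:%M').dt.hour
df2['HOUR'] = (df2['LST_TIME'] // 100).astype(int)
merged = df.merge(df2[['WBANNO', 'LST_DATE', 'HOUR', 'T_CALC', 'P_CALC']],
                  on=['WBANNO', 'LST_DATE', 'HOUR'], how='left')
merged = merged.rename(columns={'T_CALC': 'TEMP', 'P_CALC': 'PREP'})
merged.to_csv("NYPD_TRAFFIC_DATA.csv", index=False)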


Pivot function results differ from TradingView

I'm using this code to calculate pivot points.
def pivots_low(osc, LBR, LBL):
    pivots = []
    for i in range(len(osc) - LBR):
        pivots.append(0)
        pivot = True
        if i > LBL:
            for j in range(1, LBR + 1):
                if osc[i] >= osc[i + j]:
                    pivot = False
            for j in range(1, LBL + 1):
                if osc[i] > osc[i - j]:
                    pivot = False
            if pivot is True:
                pivots[len(pivots) - 1] = osc[i]
    for i in range(LBR):
        pivots.append(0)
    return pivots
This returns an array with 0s where there is no pivot and the value of the pivot where there is one.
When comparing the results to TradingView (downloaded csv with pivot points), the only time it matches exactly is when lookback left and right are both 5. Otherwise it deviates in the number of total pivots and the location of some.
But using this code to calculate pivot highs:
def pivots_high(osc, LBR, LBL):
    pivots = []
    for i in range(len(osc)-LBR):
        pivots.append(0)
        pivot = True
        if i > LBL:
            for j in range(1, LBL + 1):
                if osc[i] < osc[i-j]:
                    pivot = False
            for j in range(1, LBR + 1):
                if osc[i] <= osc[i+j]:
                    pivot = False
            if pivot is True:
                pivots[len(pivots)-1] = osc[i]
    for i in range(LBR):
        pivots.append(0)
    return pivots
the results are perfect regardless of the lookback values. But the code is almost exactly the same apart from the comparisons.
What is going wrong here? This is day 3 of having this problem and I just can't fix it.
To Reproduce:
Load Data:
Full_Data = pd.read_csv(file)
use this simple function to check matches between calculated pivots and TradingView pivots.
def match_pivs(data, pivs_h, pivs_l):  # data is a DataFrame loaded from the TradingView csv
    global lblh
    global lbrh
    global lbll
    global lbrl
    start = lbrh
    if lbrl > lbrh:
        start = lbrl
    match_h = 0
    tot_hd = 0
    tot_hp = 0
    match_l = 0
    tot_ld = 0
    tot_lp = 0
    for i in range(start, len(data)):
        if data['PivHigh'][i] != 0 and pivs_h[i-lbrh] != 0:
            match_h += 1
        if data['PivLow'][i] != 0 and pivs_l[i-lbrl] != 0:
            match_l += 1
        if data['PivHigh'][i] != 0:
            tot_hd += 1
        if data['PivLow'][i] != 0:
            tot_ld += 1
        if pivs_h[i] != 0:
            tot_hp += 1
        if pivs_l[i] != 0:
            tot_lp += 1
    print('PivsLow ' + str(tot_lp))
    print('DataLows ' + str(tot_ld))
    print('MatchesL ' + str(match_l))
    print('PivsHigh ' + str(tot_hp))
    print('DataHighs ' + str(tot_hd))
    print('MatchesH ' + str(match_h))
and to get csv from TradingView:
//@version=5
indicator("Data Script", overlay=true, max_labels_count=500)
leftLenL = input.int(title="Pivot Low", defval=10, minval=1, inline="Pivot Low", group=lengthGroupTitle)
rightLenL = input.int(title="/", defval=10, minval=1, inline="Pivot Low", group=lengthGroupTitle)
leftLenH = input.int(title="Pivot High", defval=10, minval=1, inline="Pivot High", group=lengthGroupTitle)
rightLenH = input.int(title="/", defval=10, minval=1, inline="Pivot High", group=lengthGroupTitle)
ph = ta.pivothigh(leftLenH, rightLenH)
pl = ta.pivotlow(leftLenL, rightLenL)
if not na(ph)
    plth := ph
else
    plth := 0.0
if not na(pl)
    pltl := pl
else
    pltl := 0.0
plot(plth, 'PivHigh')
plot(pltl, 'PivLow')
then just download csv with this script loaded.
Run program with these three lines:
pl = pivots_low(Full_Data['low'], lbll, lbrl)
ph = pivots_high(Full_Data['high'], lbrh, lblh)
match_pivs(Full_Data, ph, pl)
Finally found a way.
I still have no idea why that code does not work, but I've made a different version that seems to match the TradingView data 100%.
def checkhl(data_back, data_forward, hl):
    if hl == 'high' or hl == 'High':
        ref = data_back[len(data_back)-1]
        for i in range(len(data_back)-1):
            if ref < data_back[i]:
                return 0
        for i in range(len(data_forward)):
            if ref <= data_forward[i]:
                return 0
        return 1
    if hl == 'low' or hl == 'Low':
        ref = data_back[len(data_back)-1]
        for i in range(len(data_back)-1):
            if ref > data_back[i]:
                return 0
        for i in range(len(data_forward)):
            if ref >= data_forward[i]:
                return 0
        return 1
def pivot(osc, LBL, LBR, highlow):
    left = []
    right = []
    pivots = []
    for i in range(len(osc)):
        pivots.append(0.0)
        if i < LBL + 1:
            left.append(osc[i])
        if i > LBL:
            right.append(osc[i])
        if i > LBL + LBR:
            left.append(right[0])
            left.pop(0)
            right.pop(0)
            if checkhl(left, right, highlow):
                pivots[i - LBR] = osc[i - LBR]
    return pivots
then just do:
pivots_low = pivot(data, lbl, lbr, 'low')
pivots_high = pivot(data, lbl, lbr, 'high')
All the pivots will be at the actual position where they occur, not lbr bars later; positions without a pivot will be 0.0.
I'm not sure if this is efficient or not, but it seems to work.

How can I create an input for choosing different files to access?

I am quite new to python so please bear with me.
Currently, this is my code:
import pandas as pd
import statistics
import matplotlib.pyplot as plt
import math
from datetime import datetime

start_time = datetime.now()
gf = pd.read_csv(r"/Users/aaronhuang/Documents/Desktop/ffp/exfileCLEAN2.csv",
                 skiprows=[1])
bf = pd.read_csv(r"/Users/aaronhuang/Documents/Desktop/ffp/2SeconddatasetCLEAN.csv",
                 skiprows=[1])
df = (input("Which data set? "))
magnitudes = (df['Magnitude '].values)
times = df['Time '].values
average = statistics.mean(magnitudes)
sd = statistics.stdev(magnitudes)
below = sd * 3

class data_set:
    def __init__(self, index):
        self.mags = []
        self.i = index
        self.mid_time = df['Time '][index]
        self.mid_mag = df['Magnitude '][index]
        self.times = []
        ran = 80
        for ii in range(ran):
            self.times.append(df['Time '][self.i + ii - ran / 2])
            self.mags.append(df['Magnitude '][self.i + ii - ran / 2])

data = []
today = float(input("What is the range? "))
i = 0
while (i < len(df['Magnitude '])):
    if (abs(df['Magnitude '][i]) <= (average - below)):
        # check if neighbours
        t = df['Time '][i]
        tt = True
        for d in range(len(data)):
            if abs(t - data[d].mid_time) <= today:
                # check if closer to center
                if df['Magnitude '][i] < data[d].mid_mag:
                    data[d] = data_set(i)
                    print("here")
                tt = False
                break
        if tt:
            data.append(data_set(i))
    i += 1
print("found values")

# graphing
height = 2  # Change this for number of columns
width = math.ceil(len(data) / height)
if width < 2:
    width = 2
fig, axes = plt.subplots(width, height, figsize=(30, 30))
row = 0
col = 0
for i in range(len(data)):
    axes[row][col].plot(data[i].times, data[i].mags)
    col += 1
    if col > height - 1:
        col = 0
        row += 1
plt.show()
end_time = datetime.now()
print('Duration: {}'.format(end_time - start_time))
Currently, the error produced is this:
/Users/aaronhuang/.conda/envs/EXTTEst/bin/python "/Users/aaronhuang/PycharmProjects/EXTTEst/Code sandbox.py"
Which data set? gf
Traceback (most recent call last):
  File "/Users/aaronhuang/PycharmProjects/EXTTEst/Code sandbox.py", line 14, in <module>
    magnitudes = int(df['Magnitude '].values)
TypeError: string indices must be integers
Process finished with exit code 1
I am trying to have the user be able to choose which file to access to perform the rest of the code on.
So if the user types gf I would like the code to access the first data file.
Any help would be appreciated. Thank you
Why not use an if-statement at the beginning? Try this:
instead of:
gf = pd.read_csv(r"/Users/aaronhuang/Documents/Desktop/ffp/exfileCLEAN2.csv",
                 skiprows=[1])
bf = pd.read_csv(r"/Users/aaronhuang/Documents/Desktop/ffp/2SeconddatasetCLEAN.csv",
                 skiprows=[1])
df = (input("Which data set? "))
Use this:
choice = input("Which data set? ")
if choice == "gf":
    df = pd.read_csv(r"/Users/aaronhuang/Documents/Desktop/ffp/exfileCLEAN2.csv",
                     skiprows=[1])
elif choice == "bf":
    df = pd.read_csv(r"/Users/aaronhuang/Documents/Desktop/ffp/2SeconddatasetCLEAN.csv",
                     skiprows=[1])
else:
    print("Error. Your choice is not valid")
    raise SystemExit  # break is only valid inside a loop; stop the script instead
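If more files get added later, a dict keyed by the user's input keeps this from growing into a long if/elif chain. A small sketch along those lines (same paths as in the question; the dict name is just illustrative):

import pandas as pd

# Hypothetical mapping from the user's answer to the file it should load.
paths = {
    "gf": r"/Users/aaronhuang/Documents/Desktop/ffp/exfileCLEAN2.csv",
    "bf": r"/Users/aaronhuang/Documents/Desktop/ffp/2SeconddatasetCLEAN.csv",
}
choice = input("Which data set? ").strip()
if choice not in paths:
    raise SystemExit("Error. Your choice is not valid")
df = pd.read_csv(paths[choice], skiprows=[1])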

Is there a function to add WOE, calculated on Training data, to the whole data set? (python)

I am working on some Python code to predict the default rate of loans handed out by a bank.
I have calculated the WOE and information value (IV) on the training set
(using the following code: https://github.com/Sundar0989/WOE-and-IV/blob/master/WOE_IV.ipynb?fbclid=IwAR1MvEfyGsdyTre0uPJC5WRl91dfue_t0vH5qJezwm2mAg6sjHZJg9MyDYo).
We have also included two high-cardinality variables. We don't know, however, how to add these WOE scores to the whole set. How do we tackle this problem? How can we go further and use WOE to predict the target variable?
code:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy, pylab
Reading the data received from the bank, feature selection part 1, and splitting the whole set (Training) into a training set (indices_traintrain), a validation set (indices_val) and a test set (indices_test): a 70/30 split between training+validation and test, and a 70/30 split between training and validation.
Training = pd.read_excel('/Users/enjo/Documents/Master/DM/Data_DSC2019_STUDENTS/DSC2019_Training.xlsx', na_values=np.nan)
Status = Training.iloc[:,-1]
Data = Training.iloc[:,0:45]
Data_missing = Data.isna()
Data_missing = Data_missing.sum()
print(Data_missing/len(Data))
"""
drop variables with more than 80% missing
"""
Drop = ['FREE_CASH_FLOW_AMT',
        'A2_MTHS_FIRST_PCX_COREPROF_CNT', 'A2_MONTHS_IN_BELGIUM_CNT', 'A2_MTHS_SNC_FIRST_COREPROF_CNT', 'MONTHS_SINCE_LAST_REFUSAL_CNT']
DroppedTraining = Training.copy()
for element in Drop:
    DroppedTraining.drop(element, axis=1, inplace=True)
import numpy as np
from sklearn import datasets
from sklearn import svm
from sklearn import preprocessing
Data_preprocessed=[] #contains preprocessed data
from Preprocessing_continuous import Preprocessing_continuous #import function for preprocessing
from Preprocessing_discrete import Preprocessing_discrete #import function for preprocessing
from sklearn.model_selection import train_test_split
indices=np.arange(26962)
indices_train, indices_test = train_test_split(indices, test_size=0.3, random_state=0)
indices_traintrain, indices_val = train_test_split(indices_train, test_size=0.3, random_state=0)
Training['target']= Training['Label_Default'].apply(lambda x:1 if x=='Y' else 0)
Highcardinalityset=[]
Highcardinalityset = Training[['Type',
                               'INDUSTRY_CD_3',
                               'INDUSTRY_CD_4',
                               'Managing_Sales_Office_Nbr',
                               'Postal_Code_L',
                               'Product_Desc',
                               'CREDIT_TYPE_CD',
                               'ACCOUNT_PURPOSE_CD',
                               'A2_MARITAL_STATUS_CD',
                               'FINANCIAL_PRODUCT_TYPE_CD',
                               'A2_EMPLOYMENT_STATUS_CD',
                               'A2_RESIDENT_STATUS_CD',
                               'target']]
Highcardinalityset = Highcardinalityset.iloc[indices_traintrain]
Function found on GitHub:
import pandas as pd
import numpy as np
import pandas.core.algorithms as algos
from pandas import Series
import scipy.stats.stats as stats
import re
import traceback
import string
max_bin = 20
force_bin = 3
# define a binning function
def mono_bin(Y, X, n = max_bin):
    df1 = pd.DataFrame({"X": X, "Y": Y})
    justmiss = df1[['X','Y']][df1.X.isnull()]
    notmiss = df1[['X','Y']][df1.X.notnull()]
    r = 0
    while np.abs(r) < 1:
        try:
            d1 = pd.DataFrame({"X": notmiss.X, "Y": notmiss.Y, "Bucket": pd.qcut(notmiss.X, n)})
            d2 = d1.groupby('Bucket', as_index=True)
            r, p = stats.spearmanr(d2.mean().X, d2.mean().Y)
            n = n - 1
        except Exception as e:
            n = n - 1
    if len(d2) == 1:
        n = force_bin
        bins = algos.quantile(notmiss.X, np.linspace(0, 1, n))
        if len(np.unique(bins)) == 2:
            bins = np.insert(bins, 0, 1)
            bins[1] = bins[1]-(bins[1]/2)
        d1 = pd.DataFrame({"X": notmiss.X, "Y": notmiss.Y, "Bucket": pd.cut(notmiss.X, np.unique(bins), include_lowest=True)})
        d2 = d1.groupby('Bucket', as_index=True)
    d3 = pd.DataFrame({}, index=[])
    d3["MIN_VALUE"] = d2.min().X
    d3["MAX_VALUE"] = d2.max().X
    d3["COUNT"] = d2.count().Y
    d3["EVENT"] = d2.sum().Y
    d3["NONEVENT"] = d2.count().Y - d2.sum().Y
    d3 = d3.reset_index(drop=True)
    if len(justmiss.index) > 0:
        d4 = pd.DataFrame({'MIN_VALUE': np.nan}, index=[0])
        d4["MAX_VALUE"] = np.nan
        d4["COUNT"] = justmiss.count().Y
        d4["EVENT"] = justmiss.sum().Y
        d4["NONEVENT"] = justmiss.count().Y - justmiss.sum().Y
        d3 = d3.append(d4, ignore_index=True)
    d3["EVENT_RATE"] = d3.EVENT/d3.COUNT
    d3["NON_EVENT_RATE"] = d3.NONEVENT/d3.COUNT
    d3["DIST_EVENT"] = d3.EVENT/d3.sum().EVENT
    d3["DIST_NON_EVENT"] = d3.NONEVENT/d3.sum().NONEVENT
    d3["WOE"] = np.log(d3.DIST_EVENT/d3.DIST_NON_EVENT)
    d3["IV"] = (d3.DIST_EVENT-d3.DIST_NON_EVENT)*np.log(d3.DIST_EVENT/d3.DIST_NON_EVENT)
    d3["VAR_NAME"] = "VAR"
    d3 = d3[['VAR_NAME','MIN_VALUE', 'MAX_VALUE', 'COUNT', 'EVENT', 'EVENT_RATE', 'NONEVENT', 'NON_EVENT_RATE', 'DIST_EVENT','DIST_NON_EVENT','WOE', 'IV']]
    d3 = d3.replace([np.inf, -np.inf], 0)
    d3.IV = d3.IV.sum()
    return(d3)
def char_bin(Y, X):
    df1 = pd.DataFrame({"X": X, "Y": Y})
    justmiss = df1[['X','Y']][df1.X.isnull()]
    notmiss = df1[['X','Y']][df1.X.notnull()]
    df2 = notmiss.groupby('X', as_index=True)
    d3 = pd.DataFrame({}, index=[])
    d3["COUNT"] = df2.count().Y
    d3["MIN_VALUE"] = df2.sum().Y.index
    d3["MAX_VALUE"] = d3["MIN_VALUE"]
    d3["EVENT"] = df2.sum().Y
    d3["NONEVENT"] = df2.count().Y - df2.sum().Y
    if len(justmiss.index) > 0:
        d4 = pd.DataFrame({'MIN_VALUE': np.nan}, index=[0])
        d4["MAX_VALUE"] = np.nan
        d4["COUNT"] = justmiss.count().Y
        d4["EVENT"] = justmiss.sum().Y
        d4["NONEVENT"] = justmiss.count().Y - justmiss.sum().Y
        d3 = d3.append(d4, ignore_index=True)
    d3["EVENT_RATE"] = d3.EVENT/d3.COUNT
    d3["NON_EVENT_RATE"] = d3.NONEVENT/d3.COUNT
    d3["DIST_EVENT"] = d3.EVENT/d3.sum().EVENT
    d3["DIST_NON_EVENT"] = d3.NONEVENT/d3.sum().NONEVENT
    d3["WOE"] = np.log(d3.DIST_EVENT/d3.DIST_NON_EVENT)
    d3["IV"] = (d3.DIST_EVENT-d3.DIST_NON_EVENT)*np.log(d3.DIST_EVENT/d3.DIST_NON_EVENT)
    d3["VAR_NAME"] = "VAR"
    d3 = d3[['VAR_NAME','MIN_VALUE', 'MAX_VALUE', 'COUNT', 'EVENT', 'EVENT_RATE', 'NONEVENT', 'NON_EVENT_RATE', 'DIST_EVENT','DIST_NON_EVENT','WOE', 'IV']]
    d3 = d3.replace([np.inf, -np.inf], 0)
    d3.IV = d3.IV.sum()
    d3 = d3.reset_index(drop=True)
    return(d3)
def data_vars(df1, target):
    stack = traceback.extract_stack()
    filename, lineno, function_name, code = stack[-2]
    vars_name = re.compile(r'\((.*?)\).*$').search(code).groups()[0]
    final = (re.findall(r"[\w']+", vars_name))[-1]
    x = df1.dtypes.index
    count = -1
    for i in x:
        if i.upper() not in (final.upper()):
            if np.issubdtype(df1[i], np.number) and len(Series.unique(df1[i])) > 2:
                conv = mono_bin(target, df1[i])
                conv["VAR_NAME"] = i
                count = count + 1
            else:
                conv = char_bin(target, df1[i])
                conv["VAR_NAME"] = i
                count = count + 1
            if count == 0:
                iv_df = conv
            else:
                iv_df = iv_df.append(conv, ignore_index=True)
    iv = pd.DataFrame({'IV': iv_df.groupby('VAR_NAME').IV.max()})
    iv = iv.reset_index()
    return(iv_df, iv)
final_iv, IV = data_vars(Highcardinalityset,Highcardinalityset.target)
final_iv
IV.sort_values('IV')
IV.to_csv('test.csv')
transform_vars_list = Highcardinalityset.columns.difference(['target'])
transform_prefix = 'new_' # leave this value blank if you need replace the original column values
transform_vars_list
for var in transform_vars_list:
    small_df = final_iv[final_iv['VAR_NAME'] == var]
    transform_dict = dict(zip(small_df.MAX_VALUE.astype(str), small_df.WOE.astype(str)))
    replace_cmd = ''
    replace_cmd1 = ''
    for i in sorted(transform_dict.items()):
        replace_cmd = replace_cmd + str(i[1]) + str(' if x <= ') + str(i[0]) + ' else '
        replace_cmd1 = replace_cmd1 + str(i[1]) + str(' if x == "') + str(i[0]) + '" else '
    replace_cmd = replace_cmd + '0'
    replace_cmd1 = replace_cmd1 + '0'
    if replace_cmd != '0':
        try:
            Highcardinalityset[transform_prefix + var] = Highcardinalityset[var].apply(lambda x: eval(replace_cmd))
        except:
            Highcardinalityset[transform_prefix + var] = Highcardinalityset[var].apply(lambda x: eval(replace_cmd1))
Highcardinalityset['Postal_Code_L'].value_counts()
Highcardinalityset['new_Postal_Code_L'].value_counts()
Highcardinalityset['Managing_Sales_Office_Nbr'].value_counts()
Highcardinalityset['new_Managing_Sales_Office_Nbr'].value_counts()
Nice to see a high WOE: that postal code is interesting, with a high risk of default!
Highcardinalityset.to_excel("Highcardinalitysettraintrain.xlsx")
TrainingWOE = DroppedTraining[['Managing_Sales_Office_Nbr', "Postal_Code_L"]]
TrainingWOE["Postal_Code_L_WOE"]=Highcardinalityset[["new_Postal_Code_L"]]
TrainingWOE["Managing_Sales_Office_Nbr_WOE"]=Highcardinalityset[["new_Managing_Sales_Office_Nbr"]]
drop variables that are not relevant because of low IV value
Drop = ["ACCOUNT_PURPOSE_CD", "A2_MARITAL_STATUS_CD", "A2_EMPLOYMENT_STATUS_CD", "A2_RESIDENT_STATUS_CD",
"INDUSTRY_CD_3", "INDUSTRY_CD_4","Type"]
DroppedTrainingAfterIVcalc = DroppedTraining.copy()
for element in Drop:
DroppedTrainingAfterIVcalc.drop(element, axis=1,inplace=True)
Preprocess the remaining variables: 44 - 5 (too many missing) - 7 (low IV) + 1 (target variable added).
Thanks for asking this question. Here is the code to do the required transformation which is shown in the notebook as well.
transform_vars_list = df.columns.difference(['target'])
transform_prefix = 'new_'  # leave this value blank to replace the original column

# apply transformations
for var in transform_vars_list:
    small_df = final_iv[final_iv['VAR_NAME'] == var]
    transform_dict = dict(zip(small_df.MAX_VALUE, small_df.WOE))
    replace_cmd = ''
    replace_cmd1 = ''
    for i in sorted(transform_dict.items()):
        replace_cmd = replace_cmd + str(i[1]) + str(' if x <= ') + str(i[0]) + ' else '
        replace_cmd1 = replace_cmd1 + str(i[1]) + str(' if x == "') + str(i[0]) + '" else '
    replace_cmd = replace_cmd + '0'
    replace_cmd1 = replace_cmd1 + '0'
    if replace_cmd != '0':
        try:
            df[transform_prefix + var] = df[var].apply(lambda x: eval(replace_cmd))
        except:
            df[transform_prefix + var] = df[var].apply(lambda x: eval(replace_cmd1))
In addition, there is a package Xverse which does the same. Please refer to it here - https://github.com/Sundar0989/XuniVerse
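For the two high-cardinality (categorical) variables specifically, a simpler sketch of the same idea is to build a value-to-WOE mapping from the training-only final_iv table and map it over any slice of the data; unseen categories fall back to 0 here. This is an illustration, not the notebook's code, and numeric variables binned by mono_bin would need interval lookups instead.

# Sketch only: assumes final_iv was computed on the training rows and has the
# VAR_NAME, MAX_VALUE and WOE columns produced by the code above.
def apply_woe(frame, var, final_iv, prefix='new_'):
    mapping = (final_iv.loc[final_iv['VAR_NAME'] == var]
                       .set_index('MAX_VALUE')['WOE'])
    frame[prefix + var] = frame[var].map(mapping).fillna(0)  # unseen categories -> 0
    return frame

# e.g. fit on the traintrain split, then score the full Training frame:
# apply_woe(Training, 'Postal_Code_L', final_iv)
# apply_woe(Training, 'Managing_Sales_Office_Nbr', final_iv)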

Optimizing a complex algorithm

I know this is not an ideal place for questions of this scope, but I'm not sure where else to ask this or how to break it down. I've been working on a function for the past couple weeks, that runs, but for it to be feasible for my purposes, I need to speed it up 200-300x.
I have an image array, where all pixels of similar color have been averaged and set to that average value. Then I have a 2D array of the same height and width, which labels each unique and non-contiguous feature of the image.
Using these I need to assess the size of each feature and its level of contrast to each of its neighbors. These values are used in an equation and if the output of that equation is below a certain threshold, that feature is merged with its most similar neighbor.
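In code, the per-pair test described above boils down to something like the toy sketch below (not the actual function, which follows): contrast is the Euclidean distance between two regions' mean colors, and the allowed contrast shrinks as a region's share of the image grows.

import numpy as np

MAXIMUMCONTRAST = 443.405  # same constant as in the function below

def should_merge(mean_a, mean_b, size_a, max_size, ratio=1.0):
    # Euclidean distance between the two mean colors.
    contrast = np.sqrt(np.sum((np.asarray(mean_a, float) - np.asarray(mean_b, float)) ** 2))
    # Larger regions get a smaller (possibly negative) threshold, so they stop merging.
    threshold = (ratio - size_a / max_size) * MAXIMUMCONTRAST
    return contrast < threshold

# e.g. should_merge([120, 130, 125], [118, 133, 126], size_a=40, max_size=40000) -> True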
I've uploaded the image and the feature label array (printed with numpy.savetxt()) to OneDrive and attached links.
code:
def textureRemover(pix, labeledPix, ratio = 1.0):
    numElements = numpy.amax(labeledPix)
    maxSize = numpy.count_nonzero(labeledPix)
    MAXIMUMCONTRAST = 443.405
    for regionID in range(numElements):
        start = time.clock()
        regionID += 1
        if regionID not in labeledPix:
            continue
        #print(regionID)
        #print((regionID / numElements) * 100, '%')
        neighborIDs = getNeighbors(labeledPix, regionID)
        if 0 in neighborIDs:
            neighborIDs.remove(0)  # remove white value
        regionMask = labeledPix == regionID
        region = pix[regionMask]
        size = numpy.count_nonzero(regionMask)
        contrastMin = (ratio - (size / maxSize)) * MAXIMUMCONTRAST
        regionMean = region.mean(axis = 0)
        if len(neighborIDs) > 200:
            contrast = numpy.zeros(labeledPix.shape)
            contrast[labeledPix!=0] = numpy.sqrt(numpy.sum((regionMean - pix[labeledPix!=0])**2, axis = -1))
            significantMask = (contrast < contrastMin)
            significantContrasts = list(numpy.unique(contrast[significantMask]))
            significantNeighbors = {}
            for significantContrast in significantContrasts:
                minContrast = min(significantContrasts)
                if labeledPix[contrast == minContrast][0] in neighborIDs:
                    significantNeighbors[minContrast] = labeledPix[contrast == minContrast][0]
                else:
                    significantContrasts.pop(significantContrasts.index(minContrast))
        else:
            significantNeighbors = {}
            for neighborID in neighborIDs:
                neighborMask = labeledPix == neighborID
                neighbor = pix[neighborMask]
                neighborMean = neighbor.mean(axis = 0)
                contrast = numpy.sqrt(numpy.sum((regionMean - neighborMean)**2, axis = -1))
                if contrast < contrastMin:
                    significantNeighbors[contrast] = neighborID
        if significantNeighbors:
            contrasts = significantNeighbors.keys()
            minContrast = min(contrasts)
            minNeighbor = significantNeighbors[minContrast]
            neighborMask = labeledPix == minNeighbor
            neighborSize = numpy.count_nonzero(neighborMask)
            if neighborSize <= size:
                labeledPix[neighborMask] = regionID
                pix[neighborMask] = regionMean
            else:
                labeledPix[regionMask] = minNeighbor
                pix[regionMask] = pix[neighborMask].mean(axis = 0)
        print(time.clock() - start)
    return pix
pix
labeledPix
I know I'm asking for a lot of help, but I've been stuck on this for a few weeks and am unsure what else I can do. Any help will be greatly appreciated!
Here is an optimized version of most of your logic (I underestimated the amount of work that would be...). I skipped the >200 branch and am using fake data because I couldn't access your link. When I switch off your >200 branch, your code and mine appear to give the same result, and mine is quite a bit faster on the fake example.
Sample output:
original
26.056154000000003
optimized
0.763613000000003
equal
True
Code:
import numpy as np
from numpy.lib.stride_tricks import as_strided
def mockdata(m, n, k):
    colors = np.random.random((m, n, 3))
    i, j = np.ogrid[:m, :n]
    labels = np.round(k*k * (np.sin(0.05 * i) + np.sin(0.05 * j)**2)).astype(int) % k
    return colors, labels
DIAG_NEIGHBORS = True
MAXIMUMCONTRAST = 443.405
def textureRemover2(pix, labeledPix, ratio=1.0):
    start = time.clock()
    pix, labeledPix = pix.copy(), labeledPix.copy()
    pixf, labeledPixf = pix.reshape(-1, 3), labeledPix.ravel()
    m, n = labeledPix.shape
    s, t = labeledPix.strides
    # find all sizes in O(n)
    sizes = np.bincount(labeledPixf)
    n_ids = len(sizes)
    # make index for quick access to labeled areas
    lblidx = np.split(np.argsort(labeledPixf), np.cumsum(sizes[:-1]))
    lblidx[0] = None
    # find all mean colors in O(n)
    regionMeans = np.transpose([np.bincount(labeledPix.ravel(), px)
                                / np.maximum(sizes, 1)
                                for px in pix.reshape(-1, 3).T])
    # find all neighbors in O(n)
    horz = set(frozenset(p) for bl in as_strided(labeledPix, (m,n-1,2), (s,t,t))
               for p in bl)
    vert = set(frozenset(p) for bl in as_strided(labeledPix, (m-1,n,2), (s,t,s))
               for p in bl)
    nb = horz|vert
    if DIAG_NEIGHBORS:
        dwnrgt = set(frozenset(p) for bl in as_strided(
            labeledPix, (m-1,n-1,2), (s,t,s+t)) for p in bl)
        dwnlft = set(frozenset(p) for bl in as_strided(
            labeledPix[::-1], (m-1,n-1,2), (-s,t,t-s)) for p in bl)
        nb = nb|dwnrgt|dwnlft
    nb = {p for p in nb if len(p) == 2 and not 0 in p}
    nb_dict = {}
    for a, b in nb:
        nb_dict.setdefault(a, set()).add(b)
        nb_dict.setdefault(b, set()).add(a)
    maxSize = labeledPix.size - sizes[0]
    for id_ in range(1, n_ids):
        nbs = list(nb_dict.get(id_, set()))
        if not nbs:
            continue
        d = regionMeans[id_] - regionMeans[nbs]
        d = np.einsum('ij,ij->i', d, d)
        mnd = np.argmin(d)
        if d[mnd] < ((ratio - sizes[id_]/maxSize) * MAXIMUMCONTRAST)**2:
            mn = nbs[mnd]
            lrg, sml = (id_, mn) if sizes[id_] >= sizes[mn] else (mn, id_)
            sizes[lrg], sizes[sml] = sizes[lrg] + sizes[sml], 0
            for nb in nb_dict[sml]:
                nb_dict[nb].remove(sml)
                nb_dict[nb].add(lrg)
            nb_dict[lrg].update(nb_dict[sml])
            nb_dict[lrg].remove(lrg)
            nb_dict[sml] = set()
            pixf[lblidx[sml]] = regionMeans[lrg]
            labeledPixf[lblidx[sml]] = lrg
            lblidx[lrg], lblidx[sml] = np.r_[lblidx[lrg],lblidx[sml]], None
    print(time.clock() - start)
    return pix
from scipy.ndimage.morphology import binary_dilation
import time
STRUCTEL = np.ones((3,3), int) if DIAG_NEIGHBORS else np.array([[0,1,0],[1,1,1],[0,1,0]], int)
def getNeighbors(labeledPix, regionID):
    nb = set(labeledPix[binary_dilation(labeledPix == regionID, structure=STRUCTEL)])
    nb.remove(regionID)
    return sorted(nb)
numpy = np
def textureRemover(pix, labeledPix, ratio = 1.0):
    pix, labeledPix = pix.copy(), labeledPix.copy()
    numElements = numpy.amax(labeledPix)
    maxSize = numpy.count_nonzero(labeledPix)
    MAXIMUMCONTRAST = 443.405
    start = time.clock()
    for regionID in range(numElements):
        regionID += 1
        if regionID not in labeledPix:
            continue
        #print(regionID)
        #print((regionID / numElements) * 100, '%')
        neighborIDs = getNeighbors(labeledPix, regionID)
        if 0 in neighborIDs:
            neighborIDs.remove(0)  # remove white value
        regionMask = labeledPix == regionID
        region = pix[regionMask]
        size = numpy.count_nonzero(regionMask)
        contrastMin = (ratio - (size / maxSize)) * MAXIMUMCONTRAST
        regionMean = region.mean(axis = 0)
        if len(neighborIDs) > 20000:
            contrast = numpy.zeros(labeledPix.shape)
            contrast[labeledPix!=0] = numpy.sqrt(numpy.sum((regionMean - pix[labeledPix!=0])**2, axis = -1))
            significantMask = (contrast < contrastMin)
            significantContrasts = list(numpy.unique(contrast[significantMask]))
            significantNeighbors = {}
            for significantContrast in significantContrasts:
                minContrast = min(significantContrasts)
                if labeledPix[contrast == minContrast][0] in neighborIDs:
                    significantNeighbors[minContrast] = labeledPix[contrast == minContrast][0]
                else:
                    significantContrasts.pop(significantContrasts.index(minContrast))
        else:
            significantNeighbors = {}
            for neighborID in neighborIDs:
                neighborMask = labeledPix == neighborID
                neighbor = pix[neighborMask]
                neighborMean = neighbor.mean(axis = 0)
                contrast = numpy.sqrt(numpy.sum((regionMean - neighborMean)**2, axis = -1))
                if contrast < contrastMin:
                    significantNeighbors[contrast] = neighborID
        if significantNeighbors:
            contrasts = significantNeighbors.keys()
            minContrast = min(contrasts)
            minNeighbor = significantNeighbors[minContrast]
            neighborMask = labeledPix == minNeighbor
            neighborSize = numpy.count_nonzero(neighborMask)
            if neighborSize <= size:
                labeledPix[neighborMask] = regionID
                pix[neighborMask] = regionMean
            else:
                labeledPix[regionMask] = minNeighbor
                pix[regionMask] = pix[neighborMask].mean(axis = 0)
    print(time.clock() - start)
    return pix
data = mockdata(200, 200, 1000)
print('original')
res0 = textureRemover(*data)
print('optimized')
res2 = textureRemover2(*data)
print('equal')
print(np.allclose(res0, res2))

printing out dictionary values after filtering through it

I need to be able to print out the correct zipcodes that meet the threshold criteria. I can filter through them and operate on them, but the last step is to print which zipcodes are within 50 miles of the center one.
Here is my code:
import sys
import csv
import math

dicts = {}
origin = []

# methods to convert to radians
def getLatRad(latitude):
    return float(latitude) * (math.pi/180.0)

def getLongRad(longitude):
    return float(longitude) * (math.pi/180.0)

# method to find which zipcodes are within thresh
def getnearbylist(center, thresh, ziplist):
    try:
        f = open("zips.csv")
        csvParser = csv.reader(f)
        for row in csvParser:
            zipcode = row[0].strip()
            latitude = row[2].replace('"', '').strip()
            longitude = row[3].replace('"', '').strip()
            dicts[zipcode] = {'zipcode': zipcode, 'latitude': latitude, 'longitude': longitude}
        if center in dicts:
            origin = dicts[center]
            longRad2 = getLongRad(origin['longitude'])
            latRad2 = getLatRad(origin['latitude'])
            matched = {match: dicts[match] for match in ziplist if match in dicts}
            for x in matched:
                longRad1 = getLongRad(matched[x]['longitude'])
                latRad1 = getLatRad(matched[x]['latitude'])
                dlon = longRad2 - longRad1
                dlat = latRad2 - latRad1
                a = math.sin(dlat/2)**2 + math.cos(latRad1) * math.cos(latRad2) * math.sin(dlon/2)**2
                c = 2 * math.asin(math.sqrt(a))
                m = 3960 * c
                if m < thresh:  # cant figure out how to return zipcodes instead of m value
                    print m
    except ValueError:
        pass

def main():
    center = '12601'  # Our center zipcode
    thresh = 50  # We are looking for zipcodes within 50 miles
    ziplist = ['12481', '10001', '12203', '10303', '12561']  # Our test list
    nearbylist = getnearbylist(center, thresh, ziplist)  # Call the function
    print nearbylist

if __name__ == '__main__':
    main()
So instead of printing m, I want to return the zipcodes.
Thanks!
You need to capture the zips found to be nearby.
# capture the nearby zips in a new dict
near_zips = {}
for x in matched:
    longRad1 = getLongRad(matched[x]['longitude'])
    latRad1 = getLatRad(matched[x]['latitude'])
    dlon = longRad2 - longRad1
    dlat = latRad2 - latRad1
    a = math.sin(dlat/2)**2 + math.cos(latRad1) * math.cos(latRad2) * math.sin(dlon/2)**2
    c = 2 * math.asin(math.sqrt(a))
    m = 3960 * c
    if m < thresh:
        # add the nearby zipcodes to the dict
        print '%f < %f' % (m, thresh)
        print 'adding %s to near_zips' % (x,)
        near_zips[x] = matched[x]
# return the nearby zips
return near_zips
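A small usage sketch to go with this (not part of the answer above), keeping the question's Python 2 style: with getnearbylist() now returning the dict, main() can print just the matching zipcodes.

def main():
    center = '12601'
    thresh = 50
    ziplist = ['12481', '10001', '12203', '10303', '12561']
    near_zips = getnearbylist(center, thresh, ziplist)
    print sorted(near_zips.keys())  # only the zipcodes within thresh miles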
