Specify function - python

I have the following function. It calculates the Euclidean distances from some financial figures between companies and gives me the closest company. Unfortunately, sometimes the closest company is the same company. Does anyone know how I can adjust the function so that it does not return the same company?
import math
import pandas as pd

# Calculating the closest distances
records = df_ipos.to_dict('records')  # converting dataframe to a list of dictionaries

def return_closest(df, inp_record):
    """Returns the record with the closest Euclidean distance."""
    filtered_records = df.to_dict('records')  # converting dataframe to a list of dictionaries
    for record in filtered_records:  # iterating through dictionaries
        params = ['z_SA', 'z_LEV', 'z_AT', 'z_PM', 'z_RG']  # parameters for the Euclidean distance
        distance = []
        for param in params:
            d1, d2 = record.get(param, 0), inp_record.get(param, 0)  # fetching these parameters; default is 0 if not found
            if d1 != d1:  # checking for NaN
                d1 = 0
            if d2 != d2:
                d2 = 0
            distance.append((d1 - d2) ** 2)
        euclidean = math.sqrt(sum(distance))
        record['Euclidean distance'] = round(euclidean, 6)  # assigning to a new key
    distance_records = sorted(filtered_records, key=lambda x: x['Euclidean distance'])  # sorting in increasing order
    return next(filter(lambda x: x['Euclidean distance'], distance_records), None)  # returning the lowest non-zero distance; default None

for record in records:
    ipo_year = record.get('IPO Year')
    sic_code = record.get('SIC-Code')
    df = df_fundamentals[df_fundamentals['Year'] == ipo_year]
    df = df[df['SIC-Code'] == sic_code]  # filtering the dataframe
    closest_record = return_closest(df, record)
    if closest_record:
        record['Closest Company'] = closest_record.get('Name')  # adding new columns
        record['Actual Distance'] = closest_record.get('Euclidean distance')

df_dist = pd.DataFrame(records)  # changing the list of dictionaries back to a dataframe
Thanks in advance!

Based on your question, it is not exactly clear to me what your inputs are.
But as a simple fix, I would suggest checking, before your function's for loop, whether the record you are comparing is identical to the one you are checking against, i.e. add:
    ...
    filtered_records = [rec for rec in filtered_records if rec['Name'] != inp_record['Name']]
    for record in filtered_records:  # iterating through dictionaries
    ...
This only applies if 'Name' really contains the company name. Also, for your current function to return the same company at all, there must be a distance greater than zero when a record is compared against itself. I am not sure whether that is intended; maybe you are looking at data from different years? I cannot really tell, given the limited amount of information.
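For completeness, here is that filter on its own with toy records (the two dictionaries below are invented for illustration; .get is used so a missing 'Name' key does not raise):

    records = [{'Name': 'ACME', 'z_SA': 1.0},
               {'Name': 'Globex', 'z_SA': 1.2}]
    inp_record = {'Name': 'ACME', 'z_SA': 1.0}

    # drop the input company before any distances are computed
    filtered_records = [rec for rec in records
                        if rec.get('Name') != inp_record.get('Name')]
    print(filtered_records)  # [{'Name': 'Globex', 'z_SA': 1.2}]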

Related

Going from code that compares one value to all other values, to code that compares all values to all other values

I've written code that takes a q-gram size, a specific index, and a threshold, and returns the values that fall within that threshold. However, it currently only compares the set of tokens at a given index to the set of tokens at each other index. I want to compare each set of tokens at all indices to every other set of tokens at all indices. I don't think this is a difficult question, but Python is my main language and I struggle with for loops a bit.
So essentially, the variable token in the function should iterate over each string in the column and be compared with comp_token, and the index argument would be removed, since it would be iterating over all indices.
Let me know if that isn't clear enough and I will think more about how to phrase it: it is just difficult because the thing I am asking about is the thing I am struggling with.
import pandas as pd
import numpy as np
import py_stringmatching as sm

data = ['Time', "NY Times", 'Atlantic']
ph = pd.DataFrame(data, columns=['companies'])
ph.reset_index(inplace=True)

jac = sm.Jaccard()

def predict_label(num, index, thresh):
    qg_num_tok = sm.QgramTokenizer(qval=num)
    companies = ph.companies.to_list()
    ids = ph['index']
    companies_qg_num_token = {}
    companies_id2index = {}
    for i in range(len(companies)):
        companies_id2index[i] = companies[i]
        companies_qg_num_token[i] = qg_num_tok.tokenize(companies[i])
    predicted_label = [1]
    token = companies_qg_num_token[index]  # all the tokens at the index you want
    for comp_name in ids[1:]:
        comp_token = companies_qg_num_token[comp_name]
        sim = jac.get_sim_score(token, comp_token)
        if sim > thresh:
            predicted_label.append(1)
        else:
            predicted_label.append(0)
    # companies_id2index must be equal to the token number
    ph.loc[ph['index'] != companies_id2index[index], 'label'] = 0  # if not equal to index
    ph['prediction'] = predicted_label
    print(token)
    print(companies_id2index[index])
    return ph.query('prediction==1')

predict_label(ph, 1, .5)  # note: the signature above expects the q-gram size num as the first argument
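In case it helps, a minimal sketch of the all-pairs version (the function name pairwise_sims and the toy call are mine; the tokenizer and Jaccard calls are the same py_stringmatching APIs used in the question):

    import pandas as pd
    import py_stringmatching as sm

    def pairwise_sims(companies, num, thresh):
        # tokenize every company name once, up front
        qg_tok = sm.QgramTokenizer(qval=num)
        tokens = [qg_tok.tokenize(name) for name in companies]
        jac = sm.Jaccard()
        rows = []
        # compare every index i against every other index j
        for i in range(len(companies)):
            for j in range(len(companies)):
                if i == j:
                    continue  # skip self-comparison
                sim = jac.get_sim_score(tokens[i], tokens[j])
                rows.append((companies[i], companies[j], sim, int(sim > thresh)))
        return pd.DataFrame(rows, columns=['left', 'right', 'sim', 'prediction'])

    # usage on the question's data:
    # pairwise_sims(['Time', 'NY Times', 'Atlantic'], num=2, thresh=0.5)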

Trying to figure out a Pythonic way to speed up code that is slow even after using list comprehensions and pandas

I have two dataframes: one comprising a large data set, allprice_df, with time-series prices for all stocks; and the other, init_df, comprising selected stocks and trade entry dates. I am trying to find the highest price for each ticker symbol and its associated date.
The following code works, but it is time consuming, and I am wondering if there is a better, more Pythonic way to accomplish this.
import sys
import datetime
import itertools
import pandas as pd

# Initial call
init_df = init_df.assign(HighestHigh=lambda x: highestHigh(x['DateIdentified'], x['Ticker'], allprice_df))

# HighestHigh function in lambda call
def highestHigh(date1, ticker, allp_df):
    if date1.size == ticker.size:
        temp_df = pd.DataFrame(columns=['DateIdentified', 'Ticker'])
        temp_df['DateIdentified'] = date1
        temp_df['Ticker'] = ticker
    else:
        print("dates and tickers size mismatching")
        sys.exit(1)
    counter = itertools.count(0)
    high_list = [getHigh(x, y, allp_df, next(counter)) for x, y in zip(temp_df['DateIdentified'], temp_df['Ticker'])]
    return high_list

# Getting the high for each ticker
def getHigh(dateidentified, ticker, allp_df, count):
    print("trade %s" % count)
    currDate = datetime.datetime.now().date()
    allpm_df = allp_df.loc[((allp_df['Ticker'] == ticker) & (allp_df['date'] > dateidentified) & (allp_df['date'] <= currDate)), ['high', 'date']]
    hh = allpm_df.iloc[:, 0].max()
    hd = allpm_df.loc[(allpm_df['high'] == hh), 'date']
    hh = round(hh, 2)
    h_list = [hh, hd]
    return h_list

# Split the list into 2 columns, one with the price and the other with the corresponding date
init_df = split_columns(init_df, "HighestHigh")

# The function to split the list elements into different columns
def split_columns(orig_df, col):
    split_df = pd.DataFrame(orig_df[col].tolist(), columns=[col + "Mod", col + "Date"])
    split_df[col + "Date"] = split_df[col + "Date"].apply(lambda x: x.squeeze())
    orig_df = pd.concat([orig_df, split_df], axis=1)
    orig_df = orig_df.drop(col, axis=1)
    orig_df = orig_df.rename(columns={col + "Mod": col})
    return orig_df
There are a couple of obvious solutions that would help reduce your runtime.
First, in your getHigh function, instead of using loc with an element-wise comparison to find the date associated with the maximum value of high, use idxmax to get the index of the row containing the high and access that row directly:

    hh, hd = allpm_df.loc[allpm_df['high'].idxmax(), ['high', 'date']]

This replaces two O(N) operations (finding the maximum, and a lookup via element-wise comparison) with one O(N) operation and one O(1) operation.
Edit
In light of your information on the size of your dataframes, my best guess is that this line is probably where most of your time is being consumed:
allpm_df = allp_df.loc[((allp_df['Ticker']==ticker)&(allp_df['date']>dateidentified)&(allp_df['date']<=currDate)),['high','date']]
To make this faster, I would set up the data frame with a multi-index when you first create it:

    index = pd.MultiIndex.from_arrays(arrays=[ticker_symbols, dates], names=['Symbol', 'Date'])
    allp_df = pd.DataFrame(data, index=index)
    allp_df = allp_df.sort_index(level=0, sort_remaining=True)
This creates a dataframe with a sorted, multi-level index keyed on ticker symbol and date, which will reduce your search time tremendously. Once you do that, you can access all the data associated with a ticker symbol and a given date range like this:

    allp_df.loc[(ticker, dateidentified):(ticker, currDate)]
which should return your data much more quickly. For more information on multi-indexing, check out this helpful Pandas tutorial.
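To make the slicing concrete, here is a self-contained toy example (the symbols, dates, and prices are invented for illustration; only the MultiIndex pattern matters):

    import pandas as pd

    # toy stand-in for allp_df, indexed by (Symbol, Date)
    idx = pd.MultiIndex.from_arrays(
        [['AAPL', 'AAPL', 'AAPL', 'MSFT', 'MSFT'],
         pd.to_datetime(['2021-01-04', '2021-01-05', '2021-01-06',
                         '2021-01-04', '2021-01-05'])],
        names=['Symbol', 'Date'])
    allp_df = pd.DataFrame({'high': [131.0, 132.5, 130.2, 218.0, 220.1]}, index=idx)
    allp_df = allp_df.sort_index()  # a sorted index is what makes the slice fast

    # one indexed lookup instead of three boolean masks over the whole frame
    window = allp_df.loc[('AAPL', '2021-01-04'):('AAPL', '2021-01-06')]
    print(window['high'].max())  # 132.5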

Creating a dictionary and then a DataFrame from different types

I'm having a problem manipulating different types. Here is what I'm doing:
import pandas as pd
from statsmodels.tsa.seasonal import seasonal_decompose

row_list = []
for x in a:
    df = selectinfo(x)
    analysis = df[['Sales']].copy()
    decompose_result_mult = seasonal_decompose(analysis, model="additive")

    date = decompose_result_mult.trend.to_frame()
    date.reset_index(inplace=True)
    date = date.iloc[:, 0]

    trend = decompose_result_mult.trend.to_frame()
    trend.reset_index(inplace=True)
    trend = trend.iloc[:, 1]

    seasonal = decompose_result_mult.seasonal.to_frame()
    seasonal.reset_index(inplace=True)
    seasonal = seasonal.iloc[:, 1]

    residual = decompose_result_mult.resid.to_frame()
    residual.reset_index(inplace=True)
    residual = residual.iloc[:, 1]

    observed = decompose_result_mult.observed.to_frame()
    observed.reset_index(inplace=True)
    observed = observed.iloc[:, 1]

    row_dict = {'Date': date, 'region': x, 'Trend': trend, 'Seasonal': seasonal, 'Residual': residual, 'Sales': observed}
    row_list.append(row_dict)

pandasTable = pd.DataFrame(row_list)
The problem is that when I run this code, the resulting dataframe has whole Series inside its cells. Any help? I would like to have one value per row, not one list per row.
Thank you!
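A sketch of one likely fix (assuming df['Sales'] has a DatetimeIndex with an inferable frequency, as the original call implies; a and selectinfo are as defined in the question): pass the Sales column as a Series so each decomposed component is a Series, concatenate the components column-wise so every date becomes its own row, then stack the per-region frames:

    import pandas as pd
    from statsmodels.tsa.seasonal import seasonal_decompose

    frames = []
    for x in a:
        df = selectinfo(x)
        res = seasonal_decompose(df['Sales'], model="additive")
        # one column per component, one scalar per row
        region_df = pd.concat({'Trend': res.trend,
                               'Seasonal': res.seasonal,
                               'Residual': res.resid,
                               'Sales': res.observed}, axis=1).reset_index()
        region_df['region'] = x
        frames.append(region_df)

    pandasTable = pd.concat(frames, ignore_index=True)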

Dictionary seems a bit off and is missing values when updating values in a loop

So I've got a list of areas and a list of targets, and what I'm trying to do is get the closest area to each target and the distance to it, using haversine.
The problem I'm having is that the final dictionary doesn't have all the records in it. I suspect something is not being captured properly and that's jumbling up the order, but I'm having trouble figuring out where.
Here's my code; let me know if anything is unclear.
# Main dictionary with results
center_dict = dict()
for index, row in targets.iterrows():
    # latitude and longitude columns for the targets
    p_lat = row[9]
    p_lon = row[10]
    # name of the target, to be the primary key of the dict
    name = row[0]
    distances = dict()
    # loop over the areas to map against
    for ind, r in pins.iterrows():
        # longitude and latitude data for those areas
        r_lat = r[2]
        r_lon = r[3]
        # calling the haversine function, which returns a distance in miles
        distance = haversine(p_lon, p_lat, r_lon, r_lat)
        # adding the distance to the distances dict with the location name as key
        distances[r[0]] = distance
    # taking the minimum value from the dict and the key for it
    closest = min(distances, key=distances.get)
    closest_value = min(distances.values())
    # adding the two above, as well as a unique id which is in the data
    center_dict[name] = {row[1], closest, closest_value}
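Two things worth checking in the code as posted (observations, not a confirmed diagnosis): {row[1], closest, closest_value} is a set literal, so the stored values are unordered and duplicates are dropped; and if two targets share the same name, the later one silently overwrites the earlier entry in center_dict. A self-contained sketch of the alternative (the sample values below are invented):

    center_dict = {}
    name, unique_id, closest, closest_value = 'Area51', 'id-7', 'Denver', 12.3

    # flag key collisions that would otherwise silently drop records
    if name in center_dict:
        print('duplicate target name, overwriting:', name)
    # a tuple preserves order and duplicates, unlike the set literal
    center_dict[name] = (unique_id, closest, closest_value)
    print(center_dict)  # {'Area51': ('id-7', 'Denver', 12.3)}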

Output unique values from a pandas dataframe without reordering the output

I know that a few posts have been made regarding how to output the unique values of a dataframe without reordering the data.
I have tried many times to implement these methods; however, I believe the problem relates to how the dataframe in question has been defined.
Basically, I want to look into the dataframe named "C" and output the unique values into a new dataframe named "C1", without changing the order in which they are currently stored.
The line that I currently use is:

    C1 = pd.DataFrame(np.unique(C))

However, this returns a list in ascending order, while I simply want the original order preserved, only with duplicates removed.
Once again, I apologise to the advanced users who will look at my code and shake their heads; I'm still learning! And yes, I have tried numerous methods to solve this problem (redefining the C dataframe, converting the output to a list, etc.), to no avail unfortunately, so this is my cry for help to the Python gods. I defined both C and C1 as dataframes, as I understand these are pretty much the best data structures for housing data so that it can be recalled and used later; plus, it is quite useful to be able to name the columns without affecting the data contained in the dataframe.
Once again, your help would be much appreciated.
import numpy as np
import pandas as pd

F0 = ('08/02/2018', '08/02/2018', 50)
F1 = ('08/02/2018', '09/02/2018', 52)
F2 = ('10/02/2018', '11/02/2018', 46)
F3 = ('12/02/2018', '16/02/2018', 55)
F4 = ('09/02/2018', '28/02/2018', 48)
F_mat = [[F0, F1, F2, F3, F4]]
F = pd.DataFrame(np.array(F_mat).reshape(5, 3), columns=('startdate', 'enddate', 'price'))

# convert the string dates in columns startdate and enddate to datetime
F['startdate'] = pd.to_datetime(F['startdate'])
F['enddate'] = pd.to_datetime(F['enddate'])

# create the contract duration column
F['duration'] = (F['enddate'] - F['startdate']).dt.days + 1

# re-order F by 'duration', so the bootstrapping prioritises the shorter-term contracts
F = F.sort_values(by=['duration'], ascending=True)

# create prices P
P = pd.DataFrame()
for index, row in F.iterrows():
    new_P_row = pd.Series()
    for date in pd.date_range(row['startdate'], row['enddate']):
        new_P_row[date] = row['price']
    P = P.append(new_P_row, ignore_index=True)
P.fillna(0, inplace=True)

# create the C matrix, which records the unique day prices across the observation interval
C = pd.DataFrame(np.zeros((1, intNbCalendarDays)))
C.columns = tempDateRange

# create the Repatriation matrix, which records the order in which contracts will be
# stored in the A matrix; once results are generated from the linear solver, we know
# exactly which CalendarDays map to which columns in the results array
# (this array contains numbers from 1 to NbContracts)
R = pd.DataFrame(np.zeros((1, intNbCalendarDays)))
R.columns = tempDateRange

# define a zero-filled matrix, P1, which will house the dominant daily prices
P1 = pd.DataFrame(np.zeros((intNbContracts, intNbCalendarDays)))
# rename the columns of P1 to be the dates contained in matrix array D
P1.columns = tempDateRange

# create prices in the correct rows of P
for i in range(0, intNbContracts):
    for j in range(0, intNbCalendarDays):
        if P.iloc[i, j] != 0 and C.iloc[0, j] == 0:
            flUniqueCalendarMarker = P.iloc[i, j]
            C.iloc[0, j] = flUniqueCalendarMarker
            P1.iloc[i, j] = flUniqueCalendarMarker
            R.iloc[0, j] = i
            for k in range(j + 1, intNbCalendarDays):
                if C.iloc[0, k] == 0 and P.iloc[i, k] != 0:
                    C.iloc[0, k] = flUniqueCalendarMarker
                    P1.iloc[i, k] = flUniqueCalendarMarker
                    R.iloc[0, k] = i
        elif C.iloc[0, j] != 0 and P.iloc[i, j] != 0:
            P1.iloc[i, j] = C.iloc[0, j]

# convert the C dataframe into C_list, in preparation for converting C_list
# into a unique, order-preserved list
C_list = C.values.tolist()

# create the C1 matrix, which records the unique day prices across unique days in the observation period
C1 = pd.DataFrame(np.unique(C))
Use DataFrame.duplicated() to check whether your dataframe contains any duplicates.
If it does, you can try DataFrame.drop_duplicates(), which keeps the first occurrence by default and therefore preserves the original order.
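A minimal sketch of the order-preserving alternative (toy values invented; for the one-row C in the question, flatten first with C.values.ravel()):

    import pandas as pd
    import numpy as np

    s = pd.Series([50, 52, 46, 52, 50, 55])
    # pd.unique keeps order of first appearance; np.unique sorts
    C1 = pd.DataFrame(pd.unique(s), columns=['price'])
    print(C1['price'].tolist())   # [50, 52, 46, 55]
    print(np.unique(s).tolist())  # [46, 50, 52, 55]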
