Resolving TypeError in Python code

I'm running some code as follows. I'm retrieving pairs of stocks (laid out as Row 1 - Stock 1,2; Row 2 - Stock 1,2; and so on, where Stock 1 and 2 are different in each row) from a CSV file. I then take in data from Yahoo associated with these "pairs" of stocks, calculate the returns of the stocks, and check whether the distance (difference in returns) between a pair of stocks breaches some threshold; if so, I return 1. However, I'm getting the following error, which I am unable to resolve:
PricePort(tickers)
27 for ticker in tickers:
28 #print ticker
---> 29 x = pd.read_csv('http://chart.yahoo.com/table.csv?s=ttt'.replace('ttt',ticker),usecols=[0,6],index_col=0)
30 x.columns=[ticker]
31 final=pd.merge(final,x,left_index=True,right_index=True)
TypeError: expected a character buffer object
The code is as follows:
from datetime import datetime
import pytz
import csv
import pandas as pd
import pandas.io.data as web
import numpy as np

# Retrieves pairs of stocks (laid out as Row 1 - Stock 1,2, Row 2 - Stock 1,2 and so on,
# where Stock 1 and 2 are different in each row) from a CSV file
def Dataretriever():
    Pairs = []
    f1 = open('C:\Users\Pythoncode\Pairs.csv')  # Enter the location of the file
    csvdata = csv.reader(f1)
    for row in csvdata:  # reading tickers from the csv file
        Pairs.append(row)
    return Pairs
tickers = Dataretriever() #Obtaining the data
#Taking in data from Yahoo associated with these "Pairs" of Stocks
def PricePort(tickers):
    """
    Returns historical adjusted prices of a portfolio of stocks.
    tickers = pairs
    """
    final = pd.read_csv('http://chart.yahoo.com/table.csv?s=^GSPC', usecols=[0,6], index_col=0)
    final.columns = ['^GSPC']
    for ticker in tickers:
        #print ticker
        x = pd.read_csv('http://chart.yahoo.com/table.csv?s=ttt'.replace('ttt', ticker), usecols=[0,6], index_col=0)
        x.columns = [ticker]
        final = pd.merge(final, x, left_index=True, right_index=True)
    return final
#Calculating returns of the stocks
def Returns(tickers):
    l = []
    begdate = (2014,1,1)
    enddate = (2014,6,1)
    p = PricePort(tickers)
    ret = (p.close[1:] - p.close[:-1]) / p.close[1:]
    l.append(ret)
    return l
#Basically a class to see if the distance (difference in returns) between a
#pair of stocks breaches some threshold
class ThresholdClass():
    # constructor
    def __init__(self, Pairs):
        self.Pairs = Pairs

    # Calculating the distance (difference in returns) between a pair of stocks
    def Distancefunc(self, tickers):
        k = 0
        l = Returns(tickers)
        summation = [[0 for x in range(k)] for x in range(k)]  # 2d matrix for the squared distance
        for i in range(k):
            for j in range(i+1, k):  # it will be an upper triangular matrix
                for p in range(len(self.PricePort(tickers)) - 1):
                    summation[i][j] = summation[i][j] + (l[i][p] - l[j][p])**2  # calculating distance
        for i in range(k):  # setting the lower half of the matrix to 1 (if we see 1 in the answer we will set a higher limit, but typically the distance squared is less than 1)
            for j in range(i+1):
                sum[i][j] = 1
        return sum

    # This function is used in determining the threshold distance
    def MeanofPairs(self, tickers):
        sum = self.Distancefunc(tickers)
        mean = np.mean(sum)
        return mean

    # This function is used in determining the threshold distance
    def StandardDeviation(self, tickers):
        sum = self.Distancefunc(tickers)
        standard_dev = np.std(sum)
        return standard_dev

    def ThresholdandnewsChecker(self, tickers):
        threshold = self.MeanofPairs(tickers) + 2*self.StandardDeviation(tickers)
        if (self.Distancefunc(tickers) > threshold):
            return 1

Threshold_Class = ThresholdClass(tickers)
Threshold_Class.ThresholdandnewsChecker(tickers, 1)

The trouble is that Dataretriever() returns a list of lists: each CSV row is read in as a list of two ticker strings. When you iterate over tickers, the name ticker is therefore bound to a list, not a string.
The str.replace method expects both of its arguments to be strings, so the following line raises the error because the second argument is a list:
'http://chart.yahoo.com/table.csv?s=ttt'.replace('ttt', ticker)
The subsequent line x.columns = [ticker] will cause similar problems: ticker needs to be a hashable object (like a string or an integer) to serve as a column label, and lists are not hashable.
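One way to fix it is to loop over the individual ticker strings inside each pair, so read_csv always receives a string. A minimal sketch of a reworked loop, assuming each CSV row holds exactly two ticker symbols (the nested-loop structure is an illustration, not the asker's original design):
def PricePort(tickers):
    """
    Returns historical adjusted prices for ^GSPC plus every ticker in the pairs list.
    """
    final = pd.read_csv('http://chart.yahoo.com/table.csv?s=^GSPC', usecols=[0,6], index_col=0)
    final.columns = ['^GSPC']
    for pair in tickers:      # each CSV row is a pair, i.e. a list of two strings
        for ticker in pair:   # iterate over the individual ticker strings
            url = 'http://chart.yahoo.com/table.csv?s=' + ticker
            x = pd.read_csv(url, usecols=[0,6], index_col=0)
            x.columns = [ticker]  # ticker is now a string, hence hashable
            final = pd.merge(final, x, left_index=True, right_index=True)
    return final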

Related

How to create pandas columns based on a function iteratively

I have a pandas dataframe with the following relevant features:
location (city), lat, long.
I want to add to this pandas dataframe the following columns in the most efficient manner:
Distance to closest hospital
Distance to closest train station
Etc
My dataframe looks like this:
Column1 postal_code city_name type_of_property price number_of_rooms house_area fully_equipped_kitchen open_fire terrace garden surface_of_the_land number_of_facades swimming_pool state_of_the_building lattitude longitude province region
13380 13380 1785 Brussegem 1 235000 1 71 1 0 1 0 0 4 0 good 4.265887 50.927771 Brabant flamand Flandre
21135 21135 8630 Bulskamp 1 545000 3 191 1 0 0 0 0 2 0 as new 2.643449 51.044461 Flandre-Occidentale Flandre
5002 5002 4287 Lincent 0 97500 3 90 0 0 0 0 260 4 0 to renovate 5.031817 50.711613 Liège Wallonie
9544 9544 8400 Oostende 0 130000 3 119 1 0 0 1 71 2 0 to renovate 2.920327 51.230318 Flandre-Occidentale Flandre
38923 38923 4830 Limbourg 0 149000 2 140 1 0 0 0 15 2 0 to be done up 5.940299 50.612276 Liège Walloni
I found this Python package which allows me to find places nearby:
https://github.com/slimkrazy/python-google-places
So I created two functions. The first calculates the distance between two points (geodesic distance):
import geopy
from googleplaces import GooglePlaces, types, lang, ranking
import geopy.distance

def geodesicdistance(start, end):
    return print(geopy.distance.geodesic(start, end).km)
The second gets places near a start point (hospitals, train stations, etc.); what it finds depends on the type parameter:
def getplace(start, location, type):
    YOUR_API_KEY = ''
    google_places = GooglePlaces(YOUR_API_KEY)
    query_result = google_places.nearby_search(
        lat_lng=start,
        location=location,
        radius=500,
        types=[type],
        rankby=ranking.DISTANCE
    )
    return (query_result.places[0].geo_location['lng'], query_result.places[0].geo_location['lat'])
type is an enum; the relevant values for me are the following 11 types:
TYPE_BUS_STATION, TYPE_HOSPITAL, TYPE_AIRPORT, TYPE_BAKERY, TYPE_CITY_HALL, TYPE_FIRE_STATION, TYPE_PHARMACY, TYPE_POLICE, TYPE_SUBWAY_STATION, TYPE_TRAIN_STATION, TYPE_UNIVERSITY.
Basically I want to create 11 new dynamic columns: distance to closest bus station, distance to closest hospital, distance to closest airport, and so on.
The part I am missing, and where I don't have a clue, is how to iterate over the pandas dataframe and, for each row, call the functions 11 times and store each result as a new value in the corresponding column.
Update:
I tried the code below but still got an error. (I replaced location with city_name.)
import geopy
from googleplaces import GooglePlaces, types, lang, ranking
import geopy.distance

# Corrected geodesicdistance so it returns the result rather than None
def geodesicdistance(start, end):
    result = geopy.distance.geodesic(start, end).km
    #print(result)  # uncomment to print
    return result

# Refactored getplace to getplaces to provide the distance to each item in type_list
def getplaces(start, location, types):
    '''
    Get distance to the nearest item for each type
    '''
    YOUR_API_KEY = ''
    google_places = GooglePlaces('')
    # Compute the closest distance for each type
    start_lat_lng = (start['lat'], start['lng'])
    distances = []
    for type_ in types:
        # Find the closest place for type_
        query_result = google_places.nearby_search(
            lat_lng=start,        # A dict containing the following keys: lat, lng (default None)
            location=location,    # A human-readable location, e.g. 'London, England' (default None)
            radius=500,
            types=[type_],
            rankby=ranking.DISTANCE
        )
        # 0th element has the lat/lng of the closest place, since we ranked by distance
        end_lat_lng = query_result.places[0].geo_location['lat'], query_result.places[0].geo_location['lng']
        # Add the distance to the closest place of this type to the list
        distances.append(geodesicdistance(start_lat_lng, end_lat_lng))
    return distances

def min_dist(lattitude, longitude, location, types):
    '''
    Get distances for places closest to latitude/longitude
    latitude/longitude - position to find nearby items from
    location - human-readable name of the position
    types - list of places to find by type
    '''
    start = {'lat': lattitude, 'lng': longitude}
    return getplaces(start, location, types)

# List of the 11 types used from googleplaces.types
types_list = [types.TYPE_BUS_STATION, types.TYPE_HOSPITAL, types.TYPE_AIRPORT, types.TYPE_BAKERY,
              types.TYPE_CITY_HALL, types.TYPE_FIRE_STATION, types.TYPE_PHARMACY,
              types.TYPE_POLICE, types.TYPE_SUBWAY_STATION, types.TYPE_TRAIN_STATION, types.TYPE_UNIVERSITY]

# Create a DataFrame whose columns hold the distance to the closest item of each type
closest_df = df.apply(lambda row: pd.Series(min_dist(row['lattitude'],
                                                     row['longitude'],
                                                     row['city_name'],
                                                     types_list),
                                            index=types_list),
                      axis='columns',
                      result_type='expand')

# Concatenate the two dfs column-wise
final_df = pd.concat([df, closest_df], axis=1)
but I have this error:
IndexError Traceback (most recent call last)
/home/azureuser/cloudfiles/code/Users/levm38/LightGBM/Tests/featureengineering.ipynb Cell 2 in <cell line: 63>()
58 types_list = [types.TYPE_BUS_STATION, types.TYPE_HOSPITAL, types.TYPE_AIRPORT, types.TYPE_BAKERY,
59 types.TYPE_CITY_HALL, types.TYPE_FIRE_STATION, types.TYPE_PHARMACY,
60 types.TYPE_POLICE,types.TYPE_SUBWAY_STATION, types.TYPE_TRAIN_STATION, types.TYPE_UNIVERSITY]
62 # Create Dataframe whose columns have the distances to closest item by type
---> 63 closest_df = df.apply(lambda row: pd.Series(min_dist(row['lattitude'],
64 row['longitude'],
65 row['city_name'],
66 types_list),
67 index = types_list),
68 axis = 'columns',
69 result_type ='expand')
71 # Concatenate the two dfs columnwise
72 final_df = pd.concat([df, closest_df], axis = 1)
File /anaconda/envs/azureml_py38/lib/python3.8/site-packages/pandas/core/frame.py:8845, in DataFrame.apply(self, func, axis, raw, result_type, args, **kwargs)
8834 from pandas.core.apply import frame_apply
8836 op = frame_apply(
8837 self,
8838 func=func,
(...)
8843 kwargs=kwargs,
8844 )
...
---> 36 end_lat_lng = query_result.places[0].geo_location['lat'], query_result.places[0].geo_location['lng']
38 # Add distance to closest to list by type
39 distances.append(geodesicdistance(start_lat_lng , end_lat_lng ))
IndexError: list index out of range
Update 2:
The code runs, but I always get NaNs; even if I increase the radius to 5000 m instead of 500, I get NaNs in all columns.
The only thing I changed is to pass city_name as the location, with ",Belgium" hardcoded onto the location parameter after the city name.
See dataset here:
https://raw.githubusercontent.com/Joffreybvn/real-estate-data-analysis/master/data/clean/belgium_real_estate.csv
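For reference, the dataset loads straight from that URL with pandas (a one-line sketch; read_csv is the only call involved):
import pandas as pd

df = pd.read_csv('https://raw.githubusercontent.com/Joffreybvn/real-estate-data-analysis/master/data/clean/belgium_real_estate.csv')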
Updated Code:
import geopy
from googleplaces import GooglePlaces, types, lang, ranking
import geopy.distance
import numpy as np
import pandas as pd

# Corrected geodesicdistance so it returns the result rather than None
def geodesicdistance(start, end):
    result = geopy.distance.geodesic(start, end).km
    #print(result)  # uncomment to print
    return result

# Refactored getplace to getplaces to provide the distance to each item in type_list
def getplaces(start, location, types):
    '''
    Get distance to the nearest item for each type
    '''
    YOUR_API_KEY = ''
    google_places = GooglePlaces(YOUR_API_KEY)
    # Compute the closest distance for each type
    start_lat_lng = (start['lat'], start['lng'])
    distances = []
    for type_ in types:
        # Find the closest place for type_
        query_result = google_places.nearby_search(
            lat_lng=start,                   # A dict containing the following keys: lat, lng (default None)
            location=location + ",Belgium",  # A human-readable location, e.g. 'London, England' (default None)
            radius=5000,
            types=[type_],
            rankby=ranking.DISTANCE
        )
        # 9/1/2022 -- added try/except block to handle 0 responses
        try:
            # 0th element has the lat/lng of the closest place, since we ranked by distance
            end_lat_lng = query_result.places[0].geo_location['lat'], query_result.places[0].geo_location['lng']
            # Add the distance to the closest place of this type to the list
            distances.append(geodesicdistance(start_lat_lng, end_lat_lng))
        except IndexError:
            distances.append(np.nan)  # did not return a value
    return distances

def min_dist(latitude, longitude, location, types):
    '''
    Get distances for places closest to latitude/longitude
    latitude/longitude - position to find nearby items from
    location - human-readable name of the position
    types - list of places to find by type
    '''
    start = {'lat': latitude,  # spelling correction 9/1/2022
             'lng': longitude}
    return getplaces(start, location, types)

dfsmall = df.sample(10)

types_list = [types.TYPE_BUS_STATION, types.TYPE_HOSPITAL, types.TYPE_AIRPORT, types.TYPE_BAKERY,
              types.TYPE_CITY_HALL, types.TYPE_FIRE_STATION, types.TYPE_PHARMACY,
              types.TYPE_POLICE, types.TYPE_SUBWAY_STATION, types.TYPE_TRAIN_STATION, types.TYPE_UNIVERSITY]

# Create a DataFrame whose columns hold the distance to the closest item of each type
closest_df = dfsmall.apply(lambda row: pd.Series(min_dist(row['lattitude'],
                                                          row['longitude'],
                                                          row['city_name'],
                                                          types_list),
                                                 index=types_list),
                           axis='columns',
                           result_type='expand')

# Concatenate the two dfs column-wise
final_df = pd.concat([dfsmall, closest_df], axis=1)
print('Final df')
display(final_df)
This uses pandas.DataFrame.apply with result_type='expand' to create one column of distances per type. An example of the pattern is Apply pandas function to column to create multiple new columns?
Code
import geopy
from googleplaces import GooglePlaces, types, lang, ranking
import geopy.distance
import numpy as np
import pandas as pd

# Corrected geodesicdistance so it returns the result rather than None
def geodesicdistance(start, end):
    result = geopy.distance.geodesic(start, end).km
    #print(result)  # uncomment to print
    return result

# Refactored getplace to getplaces to provide the distance to each item in type_list
def getplaces(start, location, types):
    '''
    Get distance to the nearest item for each type
    '''
    YOUR_API_KEY = 'YOUR API KEY'
    google_places = GooglePlaces(YOUR_API_KEY)
    # Compute the closest distance for each type
    start_lat_lng = (start['lat'], start['lng'])
    distances = []
    for type_ in types:
        # Find the closest place for type_
        query_result = google_places.nearby_search(
            lat_lng=start,        # A dict containing the following keys: lat, lng (default None)
            location=location,    # A human-readable location, e.g. 'London, England' (default None)
            radius=500,
            types=[type_],
            rankby=ranking.DISTANCE
        )
        # 9/1/2022 -- added try/except block to handle 0 responses
        try:
            # 0th element has the lat/lng of the closest place, since we ranked by distance
            end_lat_lng = query_result.places[0].geo_location['lat'], query_result.places[0].geo_location['lng']
            # Add the distance to the closest place of this type to the list
            distances.append(geodesicdistance(start_lat_lng, end_lat_lng))
        except IndexError:
            distances.append(np.nan)  # did not return a value
    return distances

def min_dist(latitude, longitude, location, types):
    '''
    Get distances for places closest to latitude/longitude
    latitude/longitude - position to find nearby items from
    location - human-readable name of the position
    types - list of places to find by type
    '''
    start = {'lat': latitude,  # spelling correction 9/1/2022
             'lng': longitude}
    return getplaces(start, location, types)
Example Usage
from io import StringIO  # needed to read the inline sample data below

# Create a DataFrame using central areas of three cities in the USA
s = '''location latitude longitude
NYC 40.754101 -73.992081
Chicago 41.882702 -87.619392
Atlanta 33.753746 -84.386330'''
df = pd.read_csv(StringIO(s), sep=r'\s+')  # columns separated by whitespace
print('Initial df')
display(df)
# List of the 11 types used from googleplaces.types
types_list = [types.TYPE_BUS_STATION, types.TYPE_HOSPITAL, types.TYPE_AIRPORT, types.TYPE_BAKERY,
              types.TYPE_CITY_HALL, types.TYPE_FIRE_STATION, types.TYPE_PHARMACY,
              types.TYPE_POLICE, types.TYPE_SUBWAY_STATION, types.TYPE_TRAIN_STATION, types.TYPE_UNIVERSITY]

# Create a DataFrame whose columns hold the distance to the closest item of each type
closest_df = df.apply(lambda row: pd.Series(min_dist(row['latitude'],
                                                     row['longitude'],
                                                     row['location'],
                                                     types_list),
                                            index=types_list),
                      axis='columns',
                      result_type='expand')

# Concatenate the two dfs column-wise
final_df = pd.concat([df, closest_df], axis=1)
print('Final df')
display(final_df)
Output
Initial df
location latitude longitude
0 NYC 40.754101 -73.992081
1 Chicago 41.882702 -87.619392
2 Atlanta 33.753746 -84.386330
final_df
location latitude longitude bus_station hospital airport bakery city_hall fire_station pharmacy police subway_station train_station university
0 NYC 40.754101 -73.992081 0.239516 0.141911 0.196990 0.033483 NaN 0.181210 0.106216 0.248619 0.229708 0.407709 0.035780
1 Chicago 41.882702 -87.619392 0.288502 0.442175 0.957081 0.327077 1.024382 0.467242 0.249753 0.648701 0.565269 0.478530 0.424755
2 Atlanta 33.753746 -84.386330 0.374402 0.097586 0.424375 0.232727 0.548718 0.549474 0.286725 0.250114 0.366779 0.386960 0.029468

How to smoothen the value of Stochastic Oscillator from (14,1,3) to (14,3,3) in Numpy/Pandas?

I have code that reproduces the exact results given for any stock on the TradingView website. This result is for a Stochastic Oscillator with settings (14, 1, 3). What would have to be done if I want to smooth the value to (14, 3, 3)?
This is the blog which uses the same idea I am talking about, and below is my code:
df.sort_index(ascending=False, inplace=True)  # My stock data is in newest-first order

k_period = 14
d_period = 3
LOW, HIGH, CLOSE = "LOW", "HIGH", "CLOSE"  # Column names

# Adds a "n_high" column with the max value of the previous 14 periods
df['n_high'] = df[HIGH].rolling(k_period).max()
# Adds an "n_low" column with the min value of the previous 14 periods
df['n_low'] = df[LOW].rolling(k_period).min()
# Uses the min/max values to calculate %K (as a percentage)
df['%K'] = (df[CLOSE] - df['n_low']) * 100 / (df['n_high'] - df['n_low'])
# Uses %K to calculate an SMA over the past 3 values of %K
df['%D'] = df['%K'].rolling(d_period).mean()
Found the solution. It was a silly adjustment: you need to apply a rolling mean to the Blue Line (%K) as well. Here is the adjusted code.
def Stochastic(data, k_period: int = 14, d_period: int = 3, smooth_k: int = 3,
               names: tuple = ('OPEN', 'CLOSE', 'LOW', 'HIGH'), return_df: bool = False):
    '''
    Implementation of the Stochastic Oscillator. Returns the Fast and Slow line values or the whole DataFrame.
    args:
        data: Pandas DataFrame of the stock
        k_period: Period for the %K / Fast / Blue line
        d_period: Period for the %D / Slow / Red / Signal line
        smooth_k: Smoothing of the Fast line value. Increasing/decreasing it switches between the Fast and Slow Stochastic
        names: Names of the columns which contain the corresponding values
        return_df: Whether to return the DataFrame or the values
    out:
        Returns either the array containing the (fast_line, slow_line) values or the entire DataFrame
    '''
    OPEN, CLOSE, LOW, HIGH = names
    df = data.copy()
    # If the first date entry [0,0] is greater than the next one [1,0], the data is in
    # descending order; reverse it for the calculation
    if df.iloc[0, 0] > df.iloc[1, 0]:
        df.sort_index(ascending=False, inplace=True)
    # Adds a "n_high" column with the max value of the previous k_period periods
    df['n_high'] = df[HIGH].rolling(k_period).max()
    # Adds an "n_low" column with the min value of the previous k_period periods
    df['n_low'] = df[LOW].rolling(k_period).min()
    # Uses the min/max values to calculate %K, the Fast / Blue line
    df['Blue Line'] = (df[CLOSE] - df['n_low']) * 100 / (df['n_high'] - df['n_low'])
    if smooth_k > 1:  # Smooth the fast, blue line
        df['Blue Line'] = df['Blue Line'].rolling(smooth_k).mean()
    # Uses %K to calculate an SMA over the past d_period values of %K: %D, the Slow / Red line
    df['Red Line'] = df['Blue Line'].rolling(d_period).mean()
    df.drop(['n_high', 'n_low'], inplace=True, axis=1)
    df.sort_index(ascending=True, inplace=True)
    if return_df:
        return df
    return df.iloc[0, -2:]  # Fast and Slow line values for the most recent row
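A usage sketch, assuming a DataFrame df whose first column is the date and which contains 'OPEN', 'CLOSE', 'LOW' and 'HIGH' columns (the column names here are placeholders matching the defaults):

# Smoothed (14, 3, 3) stochastic: the two most recent Fast/Slow line values
fast, slow = Stochastic(df, k_period=14, d_period=3, smooth_k=3)

# Or keep the whole DataFrame, with 'Blue Line' and 'Red Line' columns added
df_with_stoch = Stochastic(df, smooth_k=3, return_df=True)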

Concatenating tables with axis=1 in Orange python

I'm fairly new to Orange.
I'm trying to separate rows of angle (elv) into intervals.
Let's say I want to separate my 90-degree angle into 8 intervals, i.e. 90/8 = 11.25 degrees per interval.
Here's the table I'm working with
Here's what I did originally, separating them by their elv value.
Here's the result that I want: x rows by 16 columns, separated by their elv value.
But I want them done dynamically.
I list them out and turn each list into a table with x rows and 2 columns.
This is what I originally did
from Orange.data.table import Table
from Orange.data import Domain, ContinuousVariable, DiscreteVariable
import numpy
import pandas as pd
from pandas import DataFrame

df = pd.DataFrame()
num = 10  # number of intervals that we want to separate our elv into
interval = 90.00 / num  # separating them into degrees per interval
low = 0
high = interval
table = []
first = []
second = []
for i in range(num):
    between = []
    if i != 0:  # not the first run
        low = high
        high = high + interval
    for row in in_data:  # run through the whole table to see if the elv falls within the interval
        if row[0] >= low and row[0] < high:
            between.append(row)
    elv = "elv" + str(i)
    err = "err" + str(i)
    domain = Domain([ContinuousVariable.make(err)], [ContinuousVariable.make(elv)])
    data = Table.from_numpy(domain, numpy.array(between))
    print("table number ", i)
    print(data[:3])
Here's the output
But as you can see, these are separate tables, reassigned on every loop iteration, and I have to find a way to concatenate them with axis=1. Even the source code for Orange3 forbids this for some reason.
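One workaround, since Orange tables convert cleanly to and from pandas: collect the per-interval tables in a list, concatenate the pandas frames column-wise, and convert back. A sketch under the assumption that the pandas_compat helpers are available in your Orange3 version, and that `tables` is a list collecting each per-interval Table built in the loop above (e.g. via tables.append(data)):

from Orange.data.pandas_compat import table_to_frame, table_from_frame
import pandas as pd

def concat_tables_columnwise(tables):
    # Orange Table -> pandas DataFrame per interval; the per-interval column names
    # ("elv0"/"err0", "elv1"/"err1", ...) are already unique, so a plain
    # column-wise concat keeps them apart
    frames = [table_to_frame(tab) for tab in tables]
    wide = pd.concat(frames, axis=1)  # x rows, 2 columns per interval
    return table_from_frame(wide)     # back to an Orange Table for downstream use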

first attempt at python, error ("IndexError: index 8 is out of bounds for axis 0 with size 8") and efficiency question

Learning Python; I just began last week, haven't otherwise coded for about 20 years, and was never that advanced to begin with. I got the hello world thing down; now I'm trying to back-test FX pairs. Any help up the learning curve is appreciated, and of course I'm scouring this site while on my Lynda vids.
I'm getting a funky error, and also wondering if there are blatantly more efficient ways to loop through columns of Excel data the way I am.
The spreadsheet being read is simple: 56 FX pairs down column A, and 8 columns across whose headers are dates, where the cells in each column are the respective FX pair's closing price on that date. The strategy starts at the top of the 2nd column (so that a return % can be calculated vs. the prior period) and calculates period-over-period % returns for each pair, identifying which has the maximum value, and then "goes long" that highest performer, whose performance in the subsequent period is recorded as PnL to the portfolio ("p" in the code). It loops through that until the current, most recent column is read.
The error relates to using 8 columns instead of 7: the code works when I limit the loop to 7 columns, but not 8. When I use 8, I get a wall of text concluding with "IndexError: index 8 is out of bounds for axis 0 with size 8". I get a similar error when I use too many rows, 56 instead of 55; I think I'm missing the bottom row.
Here's my code:
# set up imports
import pandas as pd

# import spreadsheet
x1 = pd.ExcelFile(r"C:\Users\Gamblor\Desktop\Python\test2020.xlsx")
df = pd.read_excel(x1, "Sheet1", header=1)

# define counters for loops
o = 1  # observation counter
c = 3  # column counter
r = 0  # active row counter for sorting through for max

# define identifiers for the portfolio
rpos = 0  # static row, for identifying which currency pair is in column 0 of that row
p = 100   # portfolio size starts at $100

# define the stuff we are evaluating for
pair = df.iat[r, 0]    # starting pair at 0,0 where each loop will begin
pair_pct_rtn = 0       # starts out at zero, becomes something at first evaluation, then gets compared to each subsequent eval
pair_pct_rtn_calc = 0  # a second version of the above, for comparison to the prior return

# runs a loop starting at the top to find the max period/period % return in a specific column
while (c < 8):  # manually limiting this to 5 columns left to right
    while (r < 55):  # i am manually limiting this to 55 data rows per the spreadsheet ... would be better if automatic
        pair_pct_rtn_calc = ((df.iat[r, c]) / (df.iat[r, c-1]) - 1)
        if pair_pct_rtn_calc > pair_pct_rtn:  # if it's a higher return, it must be the "max" to that point
            pair = df.iat[r, 0]               # identifies the max pair for this column observation, so far
            pair_pct_rtn = pair_pct_rtn_calc  # sets pair_pct_rtn as the new max
            rpos = r                          # identifies the max pair's ROW for this column observation, so far
        r = r + 1  # adds to r in order to jump down and calc the next row
    print('in obs #', o, ', ', pair, 'did best at', pair_pct_rtn, '.')
    o = o + 1
    # now adjust the portfolio by however well USDMXN did in the subsequent week
    p = p * (1 + ((df.iat[rpos, c+1]) / (df.iat[rpos, c]) - 1))
    print('then the subsequent period it did: ', (df.iat[rpos, c+1]) / (df.iat[rpos, c]) - 1)
    print('resulting in portfolio value of', p)
    rpos = 0
    r = 0
    pair_pct_rtn = 0
    c = c + 1  # adds to c in order to move to the next period to the right

print(p)
Since indices are labelled from 0 onwards, the 8th element has index 7, so with 8 columns the valid column indices are 0 through 7; because the loop body reads df.iat[rpos, c+1], c has to stop one column short of the last one. Likewise, row index 55 (the 56th row) will be your last row.
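On the efficiency side, a small improvement is to derive the loop bounds from the frame itself instead of hard-coding 55 and 8. A minimal sketch of the inner search, assuming the same df loaded above (the portfolio bookkeeping is omitted):

n_rows, n_cols = df.shape

c = 3
while c < n_cols - 1:  # stop one column early, since the body reads df.iat[rpos, c + 1]
    r = 0
    best_r, best_rtn = 0, float('-inf')
    while r < n_rows:  # 0-based, so the last valid row index is n_rows - 1
        rtn = df.iat[r, c] / df.iat[r, c - 1] - 1
        if rtn > best_rtn:
            best_r, best_rtn = r, rtn
        r += 1
    print(df.iat[best_r, 0], 'did best at', best_rtn)
    c += 1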

How to improve efficiency in while loop by pandas

I am new to Python. In my job I deal with masses of data, so I began to study Python to improve my efficiency.
The first small trial is this: finding the nearest distance between two coordinates.
I have two files, one named "book.csv", the other named "macro.csv". [file content screen shot][1]
book.csv has three columns: BookName, Longitude, Latitude; macro.csv has three columns: MacroName, Longitude, Latitude.
The purpose of the trial is to find the nearest macro to each book. I tried to use pandas to finish this trial; I can now get the right result, but the efficiency is a little low: with 1500 books and 200 macros it takes about 15 seconds.
Please help me with whether I can improve the efficiency. Thanks. The following is my trial code:
# import pandas lib
from pandas import Series, DataFrame
import pandas as pd
# import geopy lib, to calculate the distance between two points
import geopy.distance

# def func to calculate the distance; input parameters: two points' coordinates (Lat, Lon); returns metres
def dist(coord1, coord2):
    return geopy.distance.vincenty(coord1, coord2).m

# def func to find the nearest result, including MacroName and distance
def find_nearest_macro(df_macro, df_book):
    # Get column content from dataframe to series
    # Macro
    s_macro_name = df_macro["MacroName"]
    s_macro_Lat = df_macro["Latitude"]
    s_macro_Lon = df_macro["Longitude"]
    # Book
    s_book_name = df_book["BookName"]
    s_book_Lat = df_book["Latitude"]
    s_book_Lon = df_book["Longitude"]
    # def empty lists, used to append the nearest results
    nearest_macro = []
    nearest_dist = []
    # Loop through each book
    ibook = 0
    while ibook < len(s_book_name):
        # Get the coordinate of the current book
        book_coord = (s_book_Lat[ibook], s_book_Lon[ibook])
        # Initialize the result with the distance from this book to macro 0
        nearest_macro_name = s_macro_name[0]
        nearest_macro_dist = dist(book_coord, (s_macro_Lat[0], s_macro_Lon[0]))
        # Loop through each macro; reset the loop variable
        imacro = 1
        while imacro < len(s_macro_name):
            # Get the coordinate of the current macro
            macro_cood = (s_macro_Lat[imacro], s_macro_Lon[imacro])
            # Calculate the distance between the book and the macro
            tempd = dist(book_coord, macro_cood)
            # if this distance is closer
            if tempd < nearest_macro_dist:
                # Update the result
                nearest_macro_dist = tempd
                nearest_macro_name = s_macro_name[imacro]
            # Increment the loop variable
            imacro = imacro + 1
        # After looping over every macro, append the nearest to the results
        nearest_macro.append(nearest_macro_name)
        nearest_dist.append(nearest_macro_dist)
        # Increment the loop variable
        ibook = ibook + 1
    # return the nearest macro names and distances (a tuple can return both results)
    return (nearest_macro, nearest_dist)

# Assign the filenames:
file_macro = '.\\TestFile\\Macro.csv'
file_book = '.\\TestFile\\Book.csv'
# read content from csv into dataframes
df_macro = pd.read_csv(file_macro)
df_book = pd.read_csv(file_book)
# find the nearest macro name and distance
t_nearest_result = find_nearest_macro(df_macro, df_book)
# create new Series, converting the lists to Series
s_nearest_marco_name = Series(t_nearest_result[0])
s_nearest_macro_dist = Series(t_nearest_result[1])
# insert the new Series into the dataframe
df_book["NearestMacro"] = s_nearest_marco_name
df_book["NearestDist"] = s_nearest_macro_dist
print(df_book.head())
# write the new df_book to a new csv file
df_book.to_csv('.\\TestFile\\nearest.csv')
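A hedged sketch of one way to speed this up: most of the 15 seconds goes into the 1500 x 200 individual geodesic calls, so computing the whole distance matrix with vectorized NumPy (here a haversine approximation, slightly less exact than geopy's geodesic) and taking the row-wise argmin avoids the Python-level double loop. Everything below assumes the same df_book / df_macro frames as above.

import numpy as np

def haversine_matrix(lat1, lon1, lat2, lon2, radius_m=6371000.0):
    """Pairwise haversine distance in metres between two sets of points."""
    lat1, lon1 = np.radians(lat1)[:, None], np.radians(lon1)[:, None]  # shape (n_book, 1)
    lat2, lon2 = np.radians(lat2)[None, :], np.radians(lon2)[None, :]  # shape (1, n_macro)
    a = (np.sin((lat2 - lat1) / 2) ** 2
         + np.cos(lat1) * np.cos(lat2) * np.sin((lon2 - lon1) / 2) ** 2)
    return 2 * radius_m * np.arcsin(np.sqrt(a))  # shape (n_book, n_macro)

# Distance matrix: one row per book, one column per macro
d = haversine_matrix(df_book["Latitude"].to_numpy(), df_book["Longitude"].to_numpy(),
                     df_macro["Latitude"].to_numpy(), df_macro["Longitude"].to_numpy())

nearest_idx = d.argmin(axis=1)  # index of the closest macro for each book
df_book["NearestMacro"] = df_macro["MacroName"].to_numpy()[nearest_idx]
df_book["NearestDist"] = d.min(axis=1)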
