Pandas duration of drawdown - python

I am trying to calculate the duration of the drawdowns and the time to recovery for a stock series. I can calculate the drawdowns but am struggling to the the durations and recovery time for each drawdown. So far I have this code:
import pandas as pd
import pickle
import xlrd
import numpy as np
np.random.seed(0)
df = pd.Series(np.random.randn(2500)*0.7+0.05, index=pd.date_range('1/1/2000', periods=2500, freq='D'))
df= 100*(1+df/100).cumprod()
df=pd.DataFrame(df)
df.columns = ['close']
df['ret'] = df.close/df.close[0]
df['modMax'] = df.ret.cummax()
df['modDD'] = 1-df.ret.div(df['modMax'])
groups = df.groupby(df['modMax'])
dd = groups['modMax','modDD'].apply(lambda g: g[g['modDD'] == g['modDD'].max()])
top10dd = dd.sort_values('modDD', ascending=False).head(10)
top10dd
This gives the 10 highest drawdowns of the series but I also want the duration of the drawdown and time to recovery.

I solved the problem as follows:
def drawdown_group(df,index_list):
group_max,dd_date = index_list
ddGroup = df[df['modMax'] == group_max]
group_length = len(ddGroup)
group_dd = ddGroup['dd'].max()
group_dd_length = len(ddGroup[ddGroup.index <= dd_date])
group_start = ddGroup[0:1].index[0]
group_end = ddGroup.tail(1).index[0]
group_rec = group_length - group_dd_length
#print (group_start,group_end,group_dd,dd_date,group_dd_length,group_rec,group_length)
return group_start,group_end,group_max,group_dd,dd_date,group_dd_length,group_rec,group_length
dd_col = ('start','end','peak', 'dd','dd_date','dd_length','dd_rec','tot_length')
df_dd = pd.DataFrame(columns = dd_col)
for i in range(1,10):
index_list = top10dd[i-1:i].index.tolist()[0]
#print(index_list)
start,end,peak,dd,dd_date,dd_length,dd_rec,tot_length = drawdown_group(df,index_list)
#print(start,end,dd,dd_date,dd_length,dd_rec,tot_length)
df_dd.loc[i-1] = drawdown_group(df,index_list)
Produces this table:

Related

Pytrends Multiple geo codes

I use the codes below to export the number of searches for multiple states in US and get the error "ResponseError: The request failed: Google returned a response with code 400."
from pytrends.request import TrendReq
import pandas as pd
import time
startTime = time.time()
pytrend = TrendReq(hl='en-US', tz=360)
colnames = ["keywords"]
df = pd.read_csv("keyword_list.csv", names=colnames)
df2 = df["keywords"].values.tolist()
df2.remove("Keywords")
dataset = []
for x in range(0,len(df2)):
keywords = [df2[x]]
pytrend.build_payload(
kw_list=keywords,
cat=0,
timeframe='2020-01-01 2020-02-01',
geo='US-MA,US-TX,US-NY,US-WA')
data = pytrend.interest_over_time()
if not data.empty:
data = data.drop(labels=['isPartial'],axis='columns')
dataset.append(data)
result = pd.concat(dataset, axis=1)
result.to_csv("US.csv")
Issue is with the geo parameter, You can provide one at a time not all, You can try like this
from pytrends.request import TrendReq
import pandas as pd
import time
startTime = time.time()
pytrend = TrendReq(hl='en-US', tz=360)
colnames = ["keywords"]
df = pd.read_csv("keyword_list.csv", names=colnames)
df2 = df["keywords"].values.tolist()
df2.remove("Keywords")
dataset = []
for geo_code in ['US-MA','US-TX','US-NY','US-WA']:
for x in range(0,len(df2)):
keywords = [df2[x]]
pytrend.build_payload(
kw_list=keywords,
cat=0,
timeframe='2020-01-01 2020-02-01',
geo=geo_code)
data = pytrend.interest_over_time()
if not data.empty:
data = data.drop(labels=['isPartial'],axis='columns')
data['geo']=geo_code
dataset.append(data)
result = pd.concat(dataset)
result.to_csv("US.csv")

Add a column to a dataframe in Python

I am trying to add a few columns to a dataframe - here is the code
import import_ipynb
import talib
import numpy
import yfinance as yf
import datetime as dt
import time
from datetime import datetime, timedelta
import sqlite3
import pandas
import numpy as np
conn = sqlite3.connect('Strategy_RSI_MACD_Data.db')
c = conn.cursor()
c.execute("select distinct Stock from Universe")
tickers = c.fetchall()
for row in tickers:
if row[0]:
ticker_list.append(row[0])
stockdetails = yf.download(
tickers = ticker_list,
period = '6mo',
interval = '1d',
group_by = 'ticker',
auto_adjust = False,
prepost = False,
threads = True,
proxy = None
)
df_ta = pandas.DataFrame(data = stockdetails['Adj Close'], dtype=numpy.float64)
stockdetails['RSI'] = df_ta.apply(lambda c: talib.RSI(c, timeperiod = 14))
The last line is throwing this error:
ValueError: Wrong number of items passed 505, placement implies 1
How can I fix this?
Your lambda function is returning 505 values whereas your assignment should have just one. Try converting the output into a list-
stockdetails['RSI'] = [df_ta.apply(lambda c: talib.RSI(c, timeperiod = 14))]
I figured it out!! - I needed to insert a loop that would loop through the values:
for row in tickers:
c.execute("select [Adj Close] from StockData where Symbol = ? ", (row))
AdjClose = c.fetchall()
df_ta = pd.DataFrame(data = AdjClose, dtype=numpy.float64)
df_ta = df_ta.apply(lambda c: talib.RSI(c, timeperiod = 14))

How to square the individual matrix value using python?

Cost function implemented with Python:
**Thanks for help to achieve this.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
load_data = pd.read_csv('C:\python_program\ex1data1.txt',sep = ",",header = None)
feature_vale = load_data[0]
y = np.matrix(load_data[1])
m = len(feature_vale)
plt.scatter(load_data[0],load_data[1],marker='+',c = 'r')
plt.title("Cost_Function")
plt.xlabel("Population of City in 10,000s")
plt.ylabel("Profit in $10,000s")
df = pd.DataFrame(pd.Series(1,index= range(0,m)))
df[1] = load_data[0]
X = np.matrix(df)
row_theta = np.zeros(2,dtype = int)
theta = np.array([row_theta]) # Transpose the array
prediction = np.dot(X,theta.T)
error = (prediction-y.T)
error_df = pd.DataFrame(error)
#square the error
squared_error = np.square(error_df)
sum = np.sum(squared_error)
print(sum)
J = np.sum(squared_error) / (2 * m)
print(J)
Data reference link: searchcode.com/codesearch/view/5404318
repeat the following steps and let me know
load_data = pd.read_csv('data.txt',sep = ",",header = None)
feature_vale = load_data[0]
y = np.matrix(load_data[1])
m = len(feature_vale)
#print(m)
#plt.scatter(load_data[0],load_data[1])
df = pd.DataFrame(pd.Series(1,index= range(0,m)))
df[1] = load_data[0]
X = np.matrix(df)
row_theta = np.zeros(2,dtype = int)
theta = np.array([row_theta]) # Transpose the array
print(theta.T)
prediction = np.matmul(X,theta.T)
error = (prediction-y)
error_df = pd.DataFrame(error)
squared_error = np.square(error_df)
print(squared_error)

How to run a function on each row in DataFrame and append the result to a new DataFrame

NB My code runs if copied
I wrote a simple script to backtest cryptocurrencies using the poloniex API.
First I request the data from the API and turn it into a dataframe data.
Then I take the data I want and make new df called df
A function trade must then be run on each line in df, simple put if the price is above the rolling mean it buys and sells if below, this data is then saved in log.
I am having trouble applying this function on each row in df.
I had great success using the line log = df.apply(lambda x: trade(x['date'], x['close'], x['MA']), axis=1) BUT surprising it works when BTC_ETH is used in the API call and not for others ie BTC_FCT or BTC_DOGE despite the data being identical in form. Using ETH results in the creation of DataFrame (which is what i want) DOGE and FCT creates a Series
First question, how can I run my trade function on each row and create a new df log with the results
Bonus question, even though the data types are the same why does it work for ETH but not for DOGE/FCT ?
import numpy as np
from pandas import Series, DataFrame
import pandas as pd
API = 'https://poloniex.com/public?command=returnChartData&currencyPair=BTC_FCT&start=1435699200&end=9999999999&period=86400'
data = pd.read_json(API)
df = pd.DataFrame(columns = {'date','close','MA'})
df.MA = pd.rolling_mean(data.close, 30)
df.close = data.close
df.date = data.date
df = df.truncate(before=29)
def print_full(x):
pd.set_option('display.max_rows', len(x))
print(x)
pd.reset_option('display.max_rows')
log = pd.DataFrame(columns = ['Date', 'type', 'profit', 'port_value'])
port = {'coin': 0, 'BTC':1}
def trade(date, close, MA):
if MA < close and port['coin'] == 0 :
coins_bought = port['BTC']/MA
port['BTC'] = 0
port['coin'] = coins_bought
d = {'Date':date, 'type':'buy', 'coin_value': port['coin'], 'btc_value':port['BTC']}
return pd.Series(d)
elif MA > close and port['BTC'] == 0 :
coins_sold = port['coin']*MA
port['coin'] = 0
port['BTC'] = coins_sold
d = {'Date':date, 'type':'sell', 'coin_value': port['coin'], 'btc_value':port['BTC']}
print()
return pd.Series(d)
log = df.apply(lambda x: trade(x['date'], x['close'], x['MA']), axis=1)
log = log.dropna()
print_full(log)
EDIT:
I solved the problem, I fixed it by appending the dicts to list and then using the df.from_dict() method to create the log dataframe, my code just to clarify.
def trade(date, close, MA):#, port):
#d = {'Data': close}
#test_log = test_log.append(d, ignore_index=True)
if MA < close and port['coin'] == 0 :
coins_bought = port['BTC']/MA
port['BTC'] = 0
port['coin'] = coins_bought
d = {'Date':date, 'type':'buy', 'coin_value': port['coin'], 'btc_value':port['BTC']}
data_list.append(d)
#return pd.Series(d)
elif MA > close and port['BTC'] == 0 :
coins_sold = port['coin']*MA
port['coin'] = 0
port['BTC'] = coins_sold
d = {'Date':date, 'type':'sell', 'coin_value': port['coin'], 'btc_value':port['BTC']}
data_list.append(d)
#return pd.Series(d)
df.apply(lambda x: trade(x['date'], x['close'], x['MA']), axis=1)
log = log.dropna()
for key,value in port.items():
print(key, value )
log.from_dict(data_list)
The problem is that you are not always returning a value in trade, which is confusing Pandas. Try this:
import numpy as np
from pandas import Series, DataFrame
import pandas as pd
API = 'https://poloniex.com/public?command=returnChartData&currencyPair=BTC_FCT&start=1435699200&end=9999999999&period=86400'
data = pd.read_json(API)
df = pd.DataFrame(columns = {'date','close','MA'})
df.MA = pd.rolling_mean(data.close, 30)
df.close = data.close
df.date = data.date
df = df.truncate(before=29)
def print_full(x):
pd.set_option('display.max_rows', len(x))
print(x)
pd.reset_option('display.max_rows')
log = pd.DataFrame(columns = ['Date', 'type', 'profit', 'port_value'])
port = {'coin': 0, 'BTC':1}
port = {'coin': 0, 'BTC':1}
def trade(date, close, MA):
d = {'Date': date, 'type':'', 'coin_value': np.nan, 'btc_value': np.nan}
if MA < close and port['coin'] == 0 :
coins_bought = port['BTC']/MA
port['BTC'] = 0
port['coin'] = coins_bought
d['type'] = 'buy'
d['coin_value'] = port['coin']
d['btc_value'] = port['BTC']
elif MA > close and port['BTC'] == 0 :
coins_sold = port['coin']*MA
port['coin'] = 0
port['BTC'] = coins_sold
d['type'] = 'sell'
d['coin_value'] = port['coin']
d['btc_value'] = port['BTC']
return pd.Series(d)
log = df.apply(lambda x: trade(x['date'], x['close'], x['MA']), axis=1)
log = log.dropna()
print_full(log)
However, as I mentioned in the comment, passing a function with side-effects to apply is not a good idea according to the documentation, and in fact I think it may not produce the correct result in your case.

I want to create a time series of monthly means in Pandas

I have a dataframe that consists of hourly data for a whole year. I want to calculate the monthly means and show them in a time series plot. I have one variable which is NO2 values.
#Cleaning data
ck_2000 = pd.read_csv('2000-CamdenKerbside.csv', header=0,skiprows=4,usecols=range(0,3),skipfooter = 1, na_values = 'No data',engine = 'python')
colnames = ['Date', 'Time', 'NO2']
ck_2000.columns = colnames
#Reformat date/time
ck_2000.Time.replace(to_replace = '24:00:00', value = '00:00:00', inplace = True)
dtw = pd.to_datetime(ck_2000.Date + ck_2000.Time,format='%d/%m/%Y%H:%M:%S')
ck_2000.index = dtw
#Index dataframe by date
firstDate = ck_2000.index[0]
lastDate = ck_2000.index[len(ck_2000.Date) - 1]
ck2000 = ck_2000.reindex(index=pd.date_range(start = firstDate, end =lastDate, freq = '1H'), fill_value= None)
#Change data type to float
ck2000['NO2'] = ck2000['NO2'].dropna().astype('int64')
#Interpolation
ck_2000_int = ck_2000.interpolate()
#df's for all months
ck_2000_jan = ck_2000_int['2000-01']
ck_2000_feb = ck_2000_int['2000-02']
ck_2000_mar = ck_2000_int['2000-03']
ck_2000_apr = ck_2000_int['2000-04']
ck_2000_may = ck_2000_int['2000-05']
ck_2000_jun = ck_2000_int['2000-06']
ck_2000_jul = ck_2000_int['2000-07']
ck_2000_aug = ck_2000_int['2000-08']
ck_2000_sept = ck_2000_int['2000-09']
ck_2000_oct = ck_2000_int['2000-10']
ck_2000_nov = ck_2000_int['2000-11']
ck_2000_dec = ck_2000_int['2000-12']
you should be able to use resample
Consider the following example
tidx = pd.date_range('2000-01-01', '2000-12-31 23:00', freq='H')
ck_2000_int = pd.DataFrame(dict(NO2=np.random.randn(len(tidx))), tidx)
ck_2000_int.resample('M').mean().plot()

Categories