Add a column to a dataframe in Python

Add a column to a dataframe in Python - python

I am trying to add a few columns to a dataframe - here is the code
import import_ipynb
import talib
import numpy
import yfinance as yf
import datetime as dt
import time
from datetime import datetime, timedelta
import sqlite3
import pandas
import numpy as np
conn = sqlite3.connect('Strategy_RSI_MACD_Data.db')
c = conn.cursor()
c.execute("select distinct Stock from Universe")
tickers = c.fetchall()
for row in tickers:
if row[0]:
ticker_list.append(row[0])
stockdetails = yf.download(
tickers = ticker_list,
period = '6mo',
interval = '1d',
group_by = 'ticker',
auto_adjust = False,
prepost = False,
threads = True,
proxy = None
)
df_ta = pandas.DataFrame(data = stockdetails['Adj Close'], dtype=numpy.float64)
stockdetails['RSI'] = df_ta.apply(lambda c: talib.RSI(c, timeperiod = 14))
The last line is throwing this error:
ValueError: Wrong number of items passed 505, placement implies 1
How can I fix this?

Your lambda function is returning 505 values whereas your assignment should have just one. Try converting the output into a list-
stockdetails['RSI'] = [df_ta.apply(lambda c: talib.RSI(c, timeperiod = 14))]

I figured it out!! - I needed to insert a loop that would loop through the values:
for row in tickers:
c.execute("select [Adj Close] from StockData where Symbol = ? ", (row))
AdjClose = c.fetchall()
df_ta = pd.DataFrame(data = AdjClose, dtype=numpy.float64)
df_ta = df_ta.apply(lambda c: talib.RSI(c, timeperiod = 14))

Related

How can I print the results from this script? I can't get any results in my IDE

I am trying to see the results for this script in Spyder, but I can't get it to print and I'm not sure how. I tried print(options), print(opt), print(exps), but nothing seems to be working. I don't get any "errors" either... I just get the normal In[number]: runfile(my path)
import pandas as pd
import yfinance as yf
import datetime
def options_chain(symbol):
tk = yf.Ticker(symbol)
# Expiration dates
exps = tk.options
# Get options for each expiration
options = pd.DataFrame()
for e in exps:
opt = tk.option_chain(e)
opt = pd.DataFrame().append(opt.calls).append(opt.puts)
opt['expirationDate'] = e
options = options.append(opt, ignore_index=True)
# Bizarre error in yfinance that gives the wrong expiration date
# Add 1 day to get the correct expiration date
options['expirationDate'] = pd.to_datetime(options['expirationDate']) + datetime.timedelta(days = 1)
options['dte'] = (options['expirationDate'] - datetime.datetime.today()).dt.days / 365
# Boolean column if the option is a CALL
options['CALL'] = options['contractSymbol'].str[4:].apply(
lambda x: "C" in x)
options[['bid', 'ask', 'strike']] = options[['bid', 'ask', 'strike']].apply(pd.to_numeric)
options['mark'] = (options['bid'] + options['ask']) / 2 # Calculate the midpoint of the bid-ask
# Drop unnecessary and meaningless columns
options = options.drop(columns = ['contractSize', 'currency', 'change', 'percentChange', 'lastTradeDate', 'lastPrice'])
return options
print(options)

Overwriting one data with another data in pandas(dataframe)

Periodically (every 120 seconds) get data but recent data overwrites previous data in SQL DB. I want all data to be saved.In addition, is the timer correct?
import sqlalchemy as sa
import psycopg2
import requests as rq
import pandas as pd
import json
import time
start_time = time.time()
while True:
temp = pd.DataFrame()
df = pd.DataFrame()
vehicleList = {"SN63NBK", "YY67UTP"}
for ids in vehicleList:
r = rq.get('https://api.tfl.gov.uk/Vehicle/' + ids + '/Arrivals')
r = r.text
temp = pd.read_json(r)
temp['Type'] = 'ids'
df = pd.concat([df, temp], sort=False).reset_index(drop=True)
engine = sa.create_engine('postgresql+psycopg2://postgres:3434#127.0.0.1/postgres')
df['timing'] = list(map(lambda x: json.dumps(x), df['timing']))
df.to_sql('tfl_bus_pg6', engine, if_exists='replace', index=False)
time.sleep(120.0 - ((time.time() - start_time) % 120.0))

I changed your code slightly, but I think the main problem is in if_exists parameter which you should set to append, as #K753 have mentioned in the comments.
Also, YY67UTP id returns nothing, so I replaced it with another random id from the site to illustrate how code works.
def _data_gen(vehicles):
""" Yields a dataframe for each request """
for ids in vehicles:
time.sleep(1)
r = rq.get('https://api.tfl.gov.uk/Vehicle/' + ids + '/Arrivals')
temp = pd.read_json(r.text)
temp['Type'] = ids
yield temp
while True:
# how do you break from while loop if you need to?
vehicleList = {"SN63NBK", "YY67UTP"}
df = pd.concat(_data_gen(vehicleList), sort=False, ignore_index=True)
engine = sa.create_engine('postgresql+psycopg2://postgres:3434#127.0.0.1/postgres')
df['timing'] = list(map(lambda x: json.dumps(x), df['timing']))
df.to_sql('tfl_bus_pg6', engine, if_exists='append', index=False)
time.sleep(120)

Use of function in pandas dataframe with multiprocessing

I am attempting to speed up calculations on a pandas DataFrame using multiprocessing which goes really well minus the fact that assigning the result of the calculation to the df.ix does not work here like it does in my code without trying multiprocessing here
I've added a #sanity check to the code which outputs valid values and would make me think this would work just fine, but the DataFrame doesn't get populated (stays as NaN). Does anyone know why that may be, and more importantly, what changes may be needed to plug the values into the DataFrame in the context of multiprocessing?
Output of sanity check:
should be setting df.ix[4][1] to: 23.2506112824
should be setting df.ix[0][0] to: 0.0
should be setting df.ix[7][0] to: 15.9574526264
code:
import mysql.connector
import numpy as np
from colormath.color_objects import LabColor
from colormath.color_diff import delta_e_cie2000
import pandas as pd
from mysql.connector.pooling import MySQLConnectionPool
from multiprocessing import Pool
pool = Pool()
cnx = mysql.connector.connect(user='user', password='pass',host='localhost', database='database')
cursor = cnx.cursor()
selectstmt = 'SELECT CIE_Lab, ID FROM `database`.`table`'
cursor.execute(selectstmt)
color = cursor.fetchall()
df = pd.DataFrame(columns = color, index = color)
sides = df.index
headers = df.dtypes.index
shape = df.shape[0]
def delta(cie_Lab1, cie_Lab2):
cie_Lab1 = cie_Lab1[1:]
cie_Lab1 = cie_Lab1[:-1]
cie_Lab2 = cie_Lab2[1:]
cie_Lab2 = cie_Lab2[:-1]
CIE_list1 = cie_Lab1.split(",")
CIE_list2 = cie_Lab2.split(",")
#print CIE_list1
CIE_L1 = CIE_list1[0]
CIE_a1 = CIE_list1[1]
CIE_b1 = CIE_list1[2]
CIE_L2 = CIE_list2[0]
CIE_a2 = CIE_list2[1]
CIE_b2 = CIE_list2[2]
color1 = LabColor(lab_l=CIE_L1, lab_a=CIE_a1, lab_b=CIE_b1)
color2 = LabColor(lab_l=CIE_L2, lab_a=CIE_a2, lab_b=CIE_b2)
deltae = delta_e_cie2000(color1, color2, Kl=1, Kc=1, Kh=1)
return deltae
def deltas(nums):
listoflists = []
for num in range(nums):
for mun in range(nums):
listoflists.append([num,mun])
return listoflists
def update(inp):
sides = df.index
headers = df.dtypes.index
num = inp[0]
mun = inp[1]
res = delta(headers[num][0], sides[mun][0])
#sanity check
print "should be setting df.ix["+str(mun)+"]["+str(num)+"] to: "+str(res)
df.ix[mun][num] = res
if __name__ == '__main__':
pool = Pool(4)
pool.map(update, deltas(shape))
pool.close()
pool.join()
print df
Dataframe example:
([69.62248143012944, -54.15108764844451, 67.92070706614288], 1) \
([69.62248143012944, -54.15108764844451, 67.920... NaN
([58.17848217611454, -52.251714243997995, 56.77... NaN
([87.02539335188214, -32.15758725885986, 66.450... NaN
([86.86259502866965, -31.483524711078015, 75.14... NaN
([85.39154525710671, -31.683349117376856, 71.35... NaN

Python: invalid syntax: <string>, line 1, pos 16

I have developed a code in Python in which -in order to run the program- I need to take some arguments from the command line. But I am getting continuously the same error:
Traceback (most recent call last):
File "<string>", line 1, in <fragment>
invalid syntax: <string>, line 1, pos 16
I have the faintest idea what is wrong with my code. So, I present my code below in case someone could help me:
import QSTK.qstkutil.qsdateutil as du
import QSTK.qstkutil.tsutil as tsu
import QSTK.qstkutil.DataAccess as da
import datetime as dt
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import time
import math
import copy
import QSTK.qstkstudy.EventProfiler as ep
import csv
import sys
import argparse
def readData(li_startDate, li_endDate, ls_symbols):
#Create datetime objects for Start and End dates (STL)
dt_start = dt.datetime(li_startDate[0], li_startDate[1], li_startDate[2])
dt_end = dt.datetime(li_endDate[0], li_endDate[1], li_endDate[2])
#Initialize daily timestamp: closing prices, so timestamp should be hours=16 (STL)
dt_timeofday = dt.timedelta(hours=16)
#Get a list of trading days between the start and end dates (QSTK)
ldt_timestamps = du.getNYSEdays(dt_start, dt_end, dt_timeofday)
#Create an object of the QSTK-dataaccess class with Yahoo as the source (QSTK)
c_dataobj = da.DataAccess('Yahoo', cachestalltime=0)
#Keys to be read from the data
ls_keys = ['open', 'high', 'low', 'close', 'volume', 'actual_close']
#Read the data and map it to ls_keys via dict() (i.e. Hash Table structure)
ldf_data = c_dataobj.get_data(ldt_timestamps, ls_symbols, ls_keys)
d_data = dict(zip(ls_keys, ldf_data))
return [d_data, dt_start, dt_end, dt_timeofday, ldt_timestamps]
def marketsim(cash,orders_file,values_file):
orders = pd.read_csv(orders_file,index_col='Date',parse_dates=True,header=None)
ls_symbols = list(set(orders['X.4'].values))
df_lastrow = len(orders) - 1
dt_start = dt.datetime(orders.get_value(0, 'X.1'),orders.get_value(0, 'X.2'),orders.get_value(0, 'X.3'))
dt_end = dt.datetime(orders.get_value(df_lastrow, 'X.1'),orders.get_value(df_lastrow, 'X.2'),orders.get_value(df_lastrow, 'X.3') + 1 )
#d_data = readData(dt_start,dt_end,ls_symbols)
#Initialize daily timestamp: closing prices, so timestamp should be hours=16 (STL)
dt_timeofday = dt.timedelta(hours=16)
#Get a list of trading days between the start and end dates (QSTK)
ldt_timestamps = du.getNYSEdays(dt_start, dt_end, dt_timeofday)
#Create an object of the QSTK-dataaccess class with Yahoo as the source (QSTK)
c_dataobj = da.DataAccess('Yahoo', cachestalltime=0)
#Keys to be read from the data
ls_keys = ['open', 'high', 'low', 'close', 'volume', 'actual_close']
#Read the data and map it to ls_keys via dict() (i.e. Hash Table structure)
df_data = c_dataobj.get_data(ldt_timestamps, ls_symbols, ls_keys)
d_data = dict(zip(ls_keys, ldf_data))
ls_symbols.append("_CASH")
trades = pd.Dataframe(index=list(ldt_timestamps[0]),columns=list(ls_symbols))
current_cash = cash
trades["_CASH"][ldt_timestamps[0]] = current_cash
current_stocks = dict()
for symb in ls_symbols:
current_stocks[symb] = 0
trades[symb][ldt_timestamps[0]] = 0
for row in orders.iterrows():
row_data = row[1]
current_date = dt.datetime(row_data['X.1'],row_data['X.2'],row_data['X.3'],16)
symb = row_data['X.4']
stock_value = d_data['close'][symb][current_date]
stock_amount = row_data['X.6']
if row_data['X.5'] == "Buy":
current_cash = current_cash - (stock_value*stock_amount)
trades["_CASH"][current_date] = current_cash
current_stocks[symb] = current_stocks[symb] + stock_amount
trades[symb][current_date] = current_stocks[symb]
else:
current_cash = current_cash + (stock_value*stock_amount)
trades["_CASH"][current_date] = current_cash
current_stocks[symb] = current_stocks[symb] - stock_amount
trades[symb][current_date] = current_stocks[symb]
#trades.fillna(method='ffill',inplace=True)
#trades.fillna(method='bfill',inplace=False)
trades.fillna(0)
#alt_cash = current_cash
#alt_cash = trades.cumsum()
value_data = pd.Dataframe(index=list(ldt_timestamps),columns=list("V"))
value_data = value_data.fillna(0)
value_data = value_data.cumsum(axis=0)
for day in ldt_timestamps:
value = 0
for sym in ls_symbols:
if sym == "_CASH":
value = value + trades[sym][day]
else:
value = calue + trades[sym][day]*d_data['close'][sym][day]
value_data["V"][day] = value
fileout = open(values_file,"w")
for row in value_data.iterrows():
file_out.writelines(str(row[0].strftime('%Y,%m,%d')) + ", " + str(row[1]["V"].round()) + "\n" )
fileout.close()
def main(argv):
if len(sys.argv) != 3:
print "Invalid arguments for marketsim.py. It should be of the following syntax: marketsim.py orders_file.csv values_file.csv"
sys.exit(0)
#initial_cash = int (sys.argv[1])
initial_cash = 1000000
ordersFile = str(sys.argv[1])
valuesFile = str(sys.argv[2])
marketsim(initial_cash,ordersFile,valuesFile)
if __name__ == "__main__":
main(sys.argv[1:])
The input I gave to the command line was:
python marketsim.py orders.csv values.csv
I guess that the problem lies either into the imports or probably into the main function(incl. the if below the def main(argv)
I have to point out that the files orders.csv and values.csv exist and are located into the same folder.
I hope have made everything clear.
So, I am looking forward to reading your answers community-mates! :D
Thank you!

Pandas duration of drawdown

I am trying to calculate the duration of the drawdowns and the time to recovery for a stock series. I can calculate the drawdowns but am struggling to the the durations and recovery time for each drawdown. So far I have this code:
import pandas as pd
import pickle
import xlrd
import numpy as np
np.random.seed(0)
df = pd.Series(np.random.randn(2500)*0.7+0.05, index=pd.date_range('1/1/2000', periods=2500, freq='D'))
df= 100*(1+df/100).cumprod()
df=pd.DataFrame(df)
df.columns = ['close']
df['ret'] = df.close/df.close[0]
df['modMax'] = df.ret.cummax()
df['modDD'] = 1-df.ret.div(df['modMax'])
groups = df.groupby(df['modMax'])
dd = groups['modMax','modDD'].apply(lambda g: g[g['modDD'] == g['modDD'].max()])
top10dd = dd.sort_values('modDD', ascending=False).head(10)
top10dd
This gives the 10 highest drawdowns of the series but I also want the duration of the drawdown and time to recovery.

I solved the problem as follows:
def drawdown_group(df,index_list):
group_max,dd_date = index_list
ddGroup = df[df['modMax'] == group_max]
group_length = len(ddGroup)
group_dd = ddGroup['dd'].max()
group_dd_length = len(ddGroup[ddGroup.index <= dd_date])
group_start = ddGroup[0:1].index[0]
group_end = ddGroup.tail(1).index[0]
group_rec = group_length - group_dd_length
#print (group_start,group_end,group_dd,dd_date,group_dd_length,group_rec,group_length)
return group_start,group_end,group_max,group_dd,dd_date,group_dd_length,group_rec,group_length
dd_col = ('start','end','peak', 'dd','dd_date','dd_length','dd_rec','tot_length')
df_dd = pd.DataFrame(columns = dd_col)
for i in range(1,10):
index_list = top10dd[i-1:i].index.tolist()[0]
#print(index_list)
start,end,peak,dd,dd_date,dd_length,dd_rec,tot_length = drawdown_group(df,index_list)
#print(start,end,dd,dd_date,dd_length,dd_rec,tot_length)
df_dd.loc[i-1] = drawdown_group(df,index_list)
Produces this table:

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

Add a column to a dataframe in Python - python

Your lambda function is returning 505 values whereas your assignment should have just one. Try converting the output into a list- stockdetails['RSI'] = [df_ta.apply(lambda c: talib.RSI(c, timeperiod = 14))]

Related

How can I print the results from this script? I can't get any results in my IDE

Overwriting one data with another data in pandas(dataframe)

Use of function in pandas dataframe with multiprocessing

Python: invalid syntax: <string>, line 1, pos 16

Pandas duration of drawdown

Categories

Resources