dataframe append new series column with data - python

I have a pandas DataFrame and I want to add another column to it, but I can't do it with append, add or insert.
I'm trying to replicate the portfolio data with pandas' built-in functions, because the script below doesn't return correct data when the requested period is shorter than roughly 1.5 years, whereas I need data even for a period as short as two days. Here's the script that I want to rewrite:
import QSTK.qstkutil.qsdateutil as du
import QSTK.qstkutil.tsutil as tsu
import QSTK.qstkutil.DataAccess as da
import datetime as dt
import matplotlib.pyplot as plt
import pandas as pd

# Symbols to fetch and the date range of interest.
ls_symbols = ["AAPL", "GLD", "GOOG", "$SPX", "XOM"]
dt_start = dt.datetime(2006, 1, 1)
dt_end = dt.datetime(2010, 12, 31)

# Time-of-day offset (16 hours) handed to getNYSEdays together with the range.
dt_timeofday = dt.timedelta(hours=16)
ldt_timestamps = du.getNYSEdays(dt_start, dt_end, dt_timeofday)

# Request every key in ls_keys for every symbol from the 'Yahoo' data source.
c_dataobj = da.DataAccess('Yahoo')
ls_keys = ['open', 'high', 'low', 'close', 'volume', 'actual_close']
ldf_data = c_dataobj.get_data(ldt_timestamps, ls_symbols, ls_keys)

# This is the line I want to replicate: each key in ls_keys is paired
# positionally with the corresponding object returned by get_data().
d_data = dict(zip(ls_keys, ldf_data))
d_data = dict(zip(ls_keys, ldf_data)) is what I want to replicate because it doesn't fetch the data that I want, but I need to figure out a way to append a new column to my dict. Here is my script:
from pandas.io.data import DataReader, DataFrame
import QSTK.qstkutil.qsdateutil as du
import QSTK.qstkutil.DataAccess as da
import datetime as dt
def get_historical_data(symbol, source, date_from, date_to):
    """Fetch price history for each ticker and collect it per price field.

    Args:
        symbol: iterable of ticker names; each one is passed to DataReader.
        source: data source name forwarded to DataReader.
        date_from: start of the requested period.
        date_to: end of the requested period.

    Returns:
        dict mapping each name in ls_keys to a DataFrame. Only the 'Open'
        entry is ever filled, and only with the first ticker: adding the
        columns of the following tickers is the open problem marked below.
    """
    ls_keys = ['Open', 'High', 'Low', 'Close', 'Volume', 'Adj Close']
    symbol_data = dict((key, DataFrame({})) for key in ls_keys)
    for item in symbol:
        print('Fetching data for: %s' % item)
        current_data = DataReader(str(item), source, date_from, date_to)
        if len(symbol_data['Open'].columns) == 0:
            symbol_data['Open'] = DataFrame({item: current_data['Open']})
        else:
            # I want to add the new column here but can't seem to do this.
            # symbol_data['Open'].loc[:item] = DataFrame(dataframe_open)
            pass
    return symbol_data
P.S. I call the func with these parameters for testing purposes:
test = get_historical_data(['SPY', 'DIA'], 'yahoo', datetime(2015,1,1), datetime(2015,1,31))

Does the following help? Have not tested yet, but should work in principle... Just put the data in arrays of equal length and construct the data frame from that.
def get_historical_data(symbols=None, source=None, date_from=None, date_to=None):
    """Download price history for several tickers into a single DataFrame.

    Args:
        symbols: iterable of ticker names; None or empty yields an empty frame.
        source: data source name forwarded to DataReader.
        date_from: start of the requested period.
        date_to: end of the requested period.

    Returns:
        DataFrame whose columns are (field, ticker) pairs, so that
        ``result['Open']`` is a frame with one 'Open' column per ticker.
    """
    ls_keys = ['Open', 'High', 'Low', 'Close', 'Volume', 'Adj Close']

    # DataReader is queried once per ticker; it expects a ticker name, not
    # one of the column names in ls_keys.
    fetched = []
    for item in symbols or []:
        print('Fetching data for: %s' % item)
        fetched.append((item, DataReader(str(item), source, date_from, date_to)))

    # Tuple keys become two-level column labels; the Series are aligned on
    # their indexes when the frame is built.
    columns = {}
    for key in ls_keys:
        for item, frame in fetched:
            columns[(key, item)] = frame[key]
    return DataFrame(columns)

Related

Unable to process large amount of data using for loop

I am downloading 2 years worth of OHLC for 10k symbols and writing it to database. When I try to pull the entire list it crashes (but doesn't if I download 20%):
import config
from alpaca_trade_api.rest import REST, TimeFrame
import sqlite3
import pandas as pd
import datetime
from dateutil.relativedelta import relativedelta
# Query window: two years before now up to now. Both ends are converted to
# America/New_York timestamps and rendered as ISO-format strings.
start_date = (datetime.datetime.now() - relativedelta(years=2)).date()
start_date = pd.Timestamp(start_date, tz='America/New_York').isoformat()
end_date = pd.Timestamp(datetime.datetime.now(), tz='America/New_York').isoformat()
conn = sqlite3.connect('allStockData.db')
api = REST(config.api_key_id, config.api_secret, base_url=config.base_url)
# Read the symbol/name pairs from the `stock` table and convert them to a
# list with one dict per row.
origin_symbols = pd.read_sql_query("SELECT symbol, name from stock", conn)
df = origin_symbols
df_dict = df.to_dict('records')
startTime = datetime.datetime.now()
# NOTE(review): second construction of `api` with the same arguments as above.
api = REST(config.api_key_id, config.api_secret, base_url=config.base_url)
# Receives one single-row DataFrame per downloaded bar, for all symbols;
# nothing is written to the database until the loop below has finished.
temp_data = []
for key in df_dict:
symbol = key['symbol']
print(f"downloading ${symbol}")
# stock_id = key['id']
barsets = api.get_bars_iter(symbol, TimeFrame.Day, start_date, end_date)
# The iterator is materialised into a list before it is walked.
barsets = list(barsets)
# `index` from enumerate() is not used in the loop body.
for index, bar in enumerate(barsets):
bars = pd.DataFrame({'date': bar.t.date(), 'symbol': symbol, 'open': bar.o, 'high': bar.h, 'low': bar.l, 'close': bar.c, 'volume': bar.v, 'vwap': bar.vw}, index=[0])
temp_data.append(bars)
print("loop complete")
# Combine every per-bar frame into one DataFrame in a single step.
data = pd.concat(temp_data)
# write df back to sql, replacing the previous table
data.to_sql('daily_ohlc_init', if_exists='replace', con=conn, index=True)
endTime = datetime.datetime.now()
print(f'time elapsed to pull data was {endTime - startTime}')
To make it work I add this line after df_dict to limit symbols downloaded:
df_dict = df_dict[0:2000]
This will allow me to write to database but I need the entire dictionary (about 10k symbols). How do I write to the database without it crashing?
Since you mentioned that you are able to make it work for 2000 records of df_dict at a time, a possible simple approach could be:
api = REST(config.api_key_id, config.api_secret, base_url=config.base_url)
columns = ['date', 'symbol', 'open', 'high', 'low', 'close', 'volume', 'vwap']
num_records = len(df_dict)
chunk_size = 2000

# Walk df_dict in slices of chunk_size symbols so that only one slice worth
# of rows is held in memory before it is written to the database.
for start in range(0, num_records, chunk_size):
    end = min(start + chunk_size, num_records)
    temp_data = []
    for key in df_dict[start:end]:
        symbol = key['symbol']
        print(f"downloading ${symbol}")
        # Consume the bar iterator directly; every bar becomes one plain row.
        for bar in api.get_bars_iter(symbol, TimeFrame.Day, start_date, end_date):
            temp_data.append([bar.t.date(), symbol, bar.o, bar.h, bar.l, bar.c, bar.v, bar.vw])

    # should be a bit more efficient to create a dataframe just once
    data = pd.DataFrame(temp_data, columns=columns)
    # Replace the previous table when writing the first chunk, then append
    # the rows of every later chunk.
    data.to_sql('daily_ohlc_init', if_exists='replace' if start == 0 else 'append', con=conn, index=True)
    print(f"Internal loop finished processing records {start} to {end} out of {num_records}.")

endTime = datetime.datetime.now()
print(f'time elapsed to pull data was {endTime - startTime}')

How to pass a variable as an argument of another function in python

I have a list (chart_list) and I want to plot one chart for each item in it, but I get an error. How can I deal with this problem? I know that the items are strings, but I don't know how to pass them to tickerDf.
import streamlit as st
import yfinance as yf
import pandas as pd
import datetime
# Two layout columns; the input widgets below are placed in cols[0] / cols[1].
cols = st.columns(2)
# define the ticker symbol
tickerSymbol = cols[0].text_input("Symbol:", 'GOOG')
st.markdown(f'Shown are the **stock closing** price and **volume** of **{tickerSymbol}**')
# get data on this ticker
tickerData = yf.Ticker(tickerSymbol)
period_list = ['1d', '5d']
# NOTE(review): selected_period is assigned here but not read anywhere below.
selected_period = cols[0].selectbox("Period:", period_list)
interval_list = ['1m', '2m', '5m', '15m', '30m', '60m', '90m', '1h', '1d', '5d', '1wk', '1mo', '3mo']
selected_interval = cols[1].selectbox("Interval", interval_list)
today = datetime.date.today()
# Despite its name, this is the date ten days before today; it is used as the
# default value of the start-date input.
yesterday = today + datetime.timedelta(days=-10)
start_date = cols[0].date_input('Start date', yesterday)
end_date = cols[1].date_input('End date', today)
# NOTE(review): only an error message is shown here; nothing visible stops
# the history request below from running with the inverted range.
if start_date > end_date:
st.error("Error: End date must fall after start date")
# get the historical prices for this ticker
tickerDf = tickerData.history(interval=selected_interval, start=start_date, end=end_date)
# Open High Low Close Volume Dividends Stock Splits
chart_list = ['Open', 'High', 'Low', 'Close', 'Volume']
selected_charts = st.multiselect("Charts", chart_list)
if st.button("Show"):
# The loop walks the full chart_list, not the user's selected_charts.
for chart in chart_list:
# `tickerDf.chart` is attribute access for a member literally named "chart";
# the loop variable `chart` is not used to pick the column.
st.line_chart(tickerDf.chart)
st.write(f"## {chart}")
You can't index into a dataframe like this (st.line_chart(tickerDf.chart)), as this is a literal specification of the column name.
Try st.line_chart(tickerDf[chart]) instead
The problem is how you are accessing the dataframe in this loop:
for chart in chart_list:
st.line_chart(tickerDf.chart)
st.write(f"## {chart}")
Change it to this:
for chart in selected_charts:
st.write(f"## {chart}")
st.line_chart(tickerDf[chart])
Working code:
import streamlit as st
import yfinance as yf
import pandas as pd
import datetime
# Two layout columns; the input widgets below are placed in cols[0] / cols[1].
cols = st.columns(2)

# define the ticker symbol
tickerSymbol = cols[0].text_input("Symbol:", 'GOOG')
st.markdown(f'Shown are the **stock closing** price and **volume** of **{tickerSymbol}**')

# get data on this ticker
tickerData = yf.Ticker(tickerSymbol)
period_list = ['1d', '5d']
selected_period = cols[0].selectbox("Period:", period_list)
interval_list = ['1m', '2m', '5m', '15m', '30m', '60m', '90m', '1h', '1d', '5d', '1wk', '1mo', '3mo']
selected_interval = cols[1].selectbox("Interval", interval_list)

# The start-date input defaults to ten days before today.
today = datetime.date.today()
yesterday = today + datetime.timedelta(days=-10)
start_date = cols[0].date_input('Start date', yesterday)
end_date = cols[1].date_input('End date', today)
if start_date > end_date:
    st.error("Error: End date must fall after start date")
    # Halt this script run so no request is made with an inverted date range.
    st.stop()

# get the historical prices for this ticker
tickerDf = tickerData.history(interval=selected_interval, start=start_date, end=end_date)

# Open High Low Close Volume Dividends Stock Splits
chart_list = ['Open', 'High', 'Low', 'Close', 'Volume']
selected_charts = st.multiselect("Charts", chart_list)
if st.button("Show"):
    # Plot only the columns the user picked, each under its own heading.
    for chart in selected_charts:
        st.write(f"## {chart}")
        st.line_chart(tickerDf[chart])
Output example:

Python: Calculate VWAP by Date

I'm trying to add to this code a function that would calculate vwap by date, but it isn't working:
# Download OHLC rows for `pair` from Kraken's public OHLC endpoint and return
# them as a DataFrame indexed by time, with a per-date 'Vwap' column added.
# NOTE(review): relies on `requests` and `pd` being imported elsewhere.
def get_ohlc (pair, interval=1, since='last'):
endpoint = 'https://api.kraken.com/0/public/OHLC'
payLoad = {
'pair': pair,
'interval': interval,
'since' : since
}
response = requests.get(endpoint, payLoad)
data = response.json()
# The rows for the requested pair are looked up under result[pair].
OHLC = data['result'][pair]
data = pd.DataFrame.from_records(OHLC, columns=['Time', 'Open', 'High', 'Low', 'Close', 'vwap', 'volume', 'count'])
# 'Time' is interpreted as seconds since the epoch; 'Date' keeps only the
# calendar date and is the grouping key used further down.
data['Time'] = pd.to_datetime(data['Time'], unit='s')
data['Date'] = data['Time'].dt.date
data.set_index('Time',inplace=True)
# Drop the vwap and count columns that came with the API rows.
data = data.drop(['vwap', 'count'], axis=1)
data['Open'] = data.Open.astype(float)
data['High'] = data.High.astype(float)
data['Low'] = data.Low.astype(float)
data['Close'] = data.Close.astype(float)
data['volume'] = data.volume.astype(float)
# At this point the frame has six columns (Open, High, Low, Close, volume,
# Date). Vwap() returns each group's frame with a Vwap column added, so a
# multi-column DataFrame is assigned to the single 'Vwap' column here.
data['Vwap'] = data.groupby('Date', group_keys=False).apply(Vwap)
return data
# Return a copy of `data` with an added 'Vwap' column: the cumulative sum of
# volume * (High + Low + Close) / 3 divided by the cumulative volume.
# The result is the whole frame, not just the new column.
def Vwap(data):
H = data.High
L = data.Low
C = data.Close
V = data.volume
return data.assign(Vwap = (V * ((H+L+C)/3)).cumsum() / V.cumsum())
I get the following error:
ValueError: Wrong number of items passed 7, placement implies 1
In my view, you have been mixing the "responsibilities" in your code:
the Vwap func should only take care of the calculation bit
you can create the vwap column in the get_ohlc function (btw: that is doing too many things in my view - maybe I would split the download from the manipulation of data).
Anyway, this is how I would write a quick solution to your problem:
import requests
import pandas as pd
def get_ohlc(pair, interval=1, since='last'):
    """Download OHLC data for *pair* from Kraken and add a per-date VWAP.

    Args:
        pair: asset pair name; also the key looked up in the response's
            ``result`` mapping.
        interval: value sent as the ``interval`` query parameter.
        since: value sent as the ``since`` query parameter.

    Returns:
        DataFrame indexed by 'Time' with float columns Open, High, Low,
        Close and volume, a 'Date' column, and a 'vwap' column computed by
        vwap_func separately for every date.

    Raises:
        requests.HTTPError: if the HTTP request returns an error status.
        ValueError: if the response body carries a non-empty ``error`` entry.
    """
    endpoint = 'https://api.kraken.com/0/public/OHLC'
    payLoad = {
        'pair': pair,
        'interval': interval,
        'since': since,
    }
    # A timeout keeps the call from hanging forever on a stalled connection.
    response = requests.get(endpoint, params=payLoad, timeout=30)
    response.raise_for_status()
    body = response.json()
    if body.get('error'):
        raise ValueError('Kraken API error: %s' % body['error'])
    OHLC = body['result'][pair]

    columns = ['Time', 'Open', 'High', 'Low', 'Close', 'vwap', 'volume', 'count']
    data = pd.DataFrame.from_records(OHLC, columns=columns)
    data['Time'] = pd.to_datetime(data['Time'], unit='s')
    data['Date'] = data['Time'].dt.date
    # The vwap and count columns from the API are discarded; vwap is
    # recomputed per date below.
    data = data.set_index('Time').drop(['vwap', 'count'], axis=1)

    numeric = ['Open', 'High', 'Low', 'Close', 'volume']
    data[numeric] = data[numeric].astype(float)

    # vwap_func returns a one-column frame per date; group_keys=False keeps
    # the original time index so the result lines up with `data`.
    data = data.assign(vwap=data.groupby('Date', group_keys=False).apply(vwap_func))
    return data
def vwap_func(data):
    """Return the running volume-weighted average price as a one-column frame.

    The price used for each row is (High + Low + Close) / 3; both the
    volume-weighted prices and the volumes are accumulated with cumsum().
    """
    volume = data["volume"]
    price_sum = data["High"] + data["Low"] + data["Close"]
    traded_value = volume * price_sum / 3
    running_vwap = traded_value.cumsum() / volume.cumsum()
    return running_vwap.to_frame()
data = get_ohlc(pair="XXBTZUSD")
print(data)
As you can see, there is no need to call vwap_func at the end, given that it is applied already in your get_ohlc function

how to append data to a list

I'm trying to get buy signals using stockstats macdh values. I can compute the macdh values, and when I print them I can see them. But when I use an if statement to collect buy signals, my buy-signals list is empty. There might be something wrong with the type of the macdh values. How can I fill the buy-signals list?
import pandas as pd
import requests
import json
from stockstats import StockDataFrame as Sdf
class TradingModel:
    """Holds 4-hour kline data for one Binance symbol plus its macdh column."""

    def __init__(self, symbol):
        self.symbol = symbol
        # getData is a property, so this stores the DataFrame it returns.
        self.df = self.getData

    @property
    def getData(self):
        """Download the klines for self.symbol and return them as a DataFrame.

        Returns:
            DataFrame with float columns time, open, high, low, close and
            volume, plus the 'macdh' column taken from stockstats.
        """
        base = 'https://api.binance.com'
        endpoint = '/api/v3/klines'
        params = '?&symbol='+self.symbol+'&interval=4h'
        url = base + endpoint + params
        Data = requests.get(url)
        dictionary = Data.json()
        df = pd.DataFrame.from_dict(dictionary)
        # Only the first six fields of every kline are kept.
        df = df.drop(range(6, 12), axis=1)
        # rename columns and stockstats
        col_names = ['time', 'open', 'high', 'low', 'close', 'volume']
        df.columns = col_names
        stock = Sdf.retype(df)
        for col in col_names:
            df[col] = df[col].astype(float)
        df['macdh'] = stock['macdh']
        return df

    def strategy(self):
        """Collect, print and return [time, low] pairs flagged as buy signals.

        Returns:
            list of [time, low] pairs; empty when the condition never holds.
        """
        df = self.df
        buy_signals = []
        for i in range(1, len(df['close'])):
            # NOTE(review): iloc[-1] and iloc[-2] always address the last two
            # rows, so this test does not depend on i; row i is only used
            # for the values that get appended.
            if df['macdh'].iloc[-1] > 0 and df['macdh'].iloc[-2] < 0:
                buy_signals.append([df['time'][i], df['low'][i]])
        print(buy_signals)
        return buy_signals
def Main():
    """Build a TradingModel for the BTCUSDT pair and run its strategy."""
    TradingModel("BTCUSDT").strategy()
if __name__ == '__main__':
Main()
On running your code I observed that the values for both df['macdh'].iloc[-1] and df['macdh'].iloc[-2] are negative together in all cases so the condition if df['macdh'].iloc[-1]>0 and df['macdh'].iloc[-2]<0 is never satisfied, but the code for appending the list is correct and will work once the condition is satisfied.
Hope it helps!

Python: invalid syntax: <string>, line 1, pos 16

I have written a Python program that takes some arguments from the command line in order to run, but I keep getting the same error:
Traceback (most recent call last):
File "<string>", line 1, in <fragment>
invalid syntax: <string>, line 1, pos 16
I haven't the faintest idea what is wrong with my code, so I present it below in case someone can help me:
import QSTK.qstkutil.qsdateutil as du
import QSTK.qstkutil.tsutil as tsu
import QSTK.qstkutil.DataAccess as da
import datetime as dt
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import time
import math
import copy
import QSTK.qstkstudy.EventProfiler as ep
import csv
import sys
import argparse
def readData(li_startDate, li_endDate, ls_symbols):
    """Read data for ls_symbols between two dates from the 'Yahoo' source.

    Args:
        li_startDate: sequence whose first three items are year, month, day.
        li_endDate: sequence whose first three items are year, month, day.
        ls_symbols: symbols passed on to the data access object.

    Returns:
        list [d_data, dt_start, dt_end, dt_timeofday, ldt_timestamps], where
        d_data maps each requested key to the matching get_data() result.
    """
    def _as_datetime(li_date):
        # Build a datetime from the (year, month, day) items of the sequence.
        return dt.datetime(li_date[0], li_date[1], li_date[2])

    dt_start, dt_end = _as_datetime(li_startDate), _as_datetime(li_endDate)

    # 16-hour offset passed to getNYSEdays along with the date range.
    dt_timeofday = dt.timedelta(hours=16)
    ldt_timestamps = du.getNYSEdays(dt_start, dt_end, dt_timeofday)

    # Data access object for the 'Yahoo' source and the keys to request.
    source = da.DataAccess('Yahoo', cachestalltime=0)
    ls_keys = ['open', 'high', 'low', 'close', 'volume', 'actual_close']

    # Pair every key with the corresponding object returned by get_data().
    d_data = dict(zip(ls_keys, source.get_data(ldt_timestamps, ls_symbols, ls_keys)))
    return [d_data, dt_start, dt_end, dt_timeofday, ldt_timestamps]
def marketsim(cash, orders_file, values_file):
    """Simulate a list of orders and write the daily portfolio value to a file.

    Args:
        cash: starting cash balance.
        orders_file: path of a header-less CSV whose first six columns are
            year, month, day, symbol, action and share count. An action of
            "Buy" is a purchase; anything else is treated as a sale. Rows
            are assumed to be in date order, because the first and last
            rows define the simulated period.
        values_file: path of the output file; one "YYYY,MM,DD, value" line
            is written per trading day.

    Raises:
        ValueError: if the orders file contains no rows.
    """
    # Explicit names keep the 'X.n' labels used below independent of how
    # pandas names header-less columns.
    order_columns = ['X.1', 'X.2', 'X.3', 'X.4', 'X.5', 'X.6']
    orders = pd.read_csv(orders_file, header=None, usecols=list(range(6)), names=order_columns)
    if orders.empty:
        raise ValueError("orders file %r contains no orders" % (orders_file,))

    ls_symbols = list(set(orders['X.4'].values))
    first_order, last_order = orders.iloc[0], orders.iloc[-1]
    dt_start = dt.datetime(int(first_order['X.1']), int(first_order['X.2']), int(first_order['X.3']))
    # Adding a timedelta (rather than "day + 1") stays valid at month ends.
    dt_end = dt.datetime(int(last_order['X.1']), int(last_order['X.2']), int(last_order['X.3'])) + dt.timedelta(days=1)

    # 16-hour offset passed to getNYSEdays along with the date range.
    dt_timeofday = dt.timedelta(hours=16)
    ldt_timestamps = du.getNYSEdays(dt_start, dt_end, dt_timeofday)
    c_dataobj = da.DataAccess('Yahoo', cachestalltime=0)
    ls_keys = ['open', 'high', 'low', 'close', 'volume', 'actual_close']
    ldf_data = c_dataobj.get_data(ldt_timestamps, ls_symbols, ls_keys)
    d_data = dict(zip(ls_keys, ldf_data))

    # One row per trading day; each column holds the running balance of a
    # symbol (in shares) or of cash. "_CASH" is kept out of ls_symbols so the
    # share initialisation cannot overwrite the starting cash.
    trades = pd.DataFrame(index=ldt_timestamps, columns=ls_symbols + ["_CASH"], dtype=float)
    trades.iloc[0] = 0.0
    trades.loc[ldt_timestamps[0], "_CASH"] = cash

    current_cash = cash
    current_stocks = dict((symb, 0) for symb in ls_symbols)
    for _, row_data in orders.iterrows():
        current_date = dt.datetime(int(row_data['X.1']), int(row_data['X.2']), int(row_data['X.3']), 16)
        symb = row_data['X.4']
        stock_value = d_data['close'][symb][current_date]
        stock_amount = row_data['X.6']
        if row_data['X.5'] == "Buy":
            current_cash = current_cash - (stock_value * stock_amount)
            current_stocks[symb] = current_stocks[symb] + stock_amount
        else:
            current_cash = current_cash + (stock_value * stock_amount)
            current_stocks[symb] = current_stocks[symb] - stock_amount
        trades.loc[current_date, "_CASH"] = current_cash
        trades.loc[current_date, symb] = current_stocks[symb]

    # The stored values are running balances, so days without an order carry
    # the most recent balance forward; anything still missing counts as 0.
    trades = trades.ffill().fillna(0)

    with open(values_file, "w") as fileout:
        for day in ldt_timestamps:
            value = trades.loc[day, "_CASH"]
            for sym in ls_symbols:
                value = value + trades.loc[day, sym] * d_data['close'][sym][day]
            fileout.write(day.strftime('%Y,%m,%d') + ", " + str(round(float(value), 0)) + "\n")
def main(argv):
    """Run the market simulator from command-line style arguments.

    Args:
        argv: the arguments after the program name; exactly two are
            expected, the orders file and the values file.

    Raises:
        SystemExit: with the usage message (a non-zero exit status) when the
            number of arguments is wrong.
    """
    if len(argv) != 2:
        # sys.exit with a string prints it to stderr and exits with status 1.
        sys.exit("Invalid arguments for marketsim.py. It should be of the following syntax: marketsim.py orders_file.csv values_file.csv")
    # initial_cash = int(argv[...])
    initial_cash = 1000000
    ordersFile = str(argv[0])
    valuesFile = str(argv[1])
    marketsim(initial_cash, ordersFile, valuesFile)
if __name__ == "__main__":
main(sys.argv[1:])
The input I gave to the command line was:
python marketsim.py orders.csv values.csv
I guess that the problem lies either in the imports or in the main function (including the `if` below `def main(argv)`).
I should point out that the files orders.csv and values.csv exist and are located in the same folder.
I hope I have made everything clear.
So, I am looking forward to reading your answers community-mates! :D
Thank you!

Categories