Creating a class from df.iterrows() is painfully slow - python

I'm wondering if I am doing this the correct way - I am new to Python, and tried to figure this out as best I could, but now I've almost completed my project, part of it is painfully slow.
I have pulled down daily OHLC bars, filtered through them to find the gappers, and then I have created a class which goes over the minute data for those daily gappers and returns important information which I use later for my backtests, e.g. whether we've hit the 200 EMA in the pre-market.
Here's what my code looks like:
# get the dates for our gaps
# get the dates for our gaps
import os.path
import glob
import numpy as np
import pandas as pd  # fix: `pd` was used below but pandas was never imported
from pathlib import Path

folder = "daily_bars_filtered/*.csv"
df_gapper_list = []
df_intraday_analysis = []
# loop through the daily gappers (first 13 files only while testing)
for fname in glob.glob(folder)[:13]:
    ticker = Path(fname).stem
    df = pd.read_csv(fname)
    df['ticker'] = ticker
    df_gapper_list.append(df)
    print(f'downloading {ticker}')
    # get the intraday bars data for the entire dates
    file_ = 'intraday_bars_gapped_new/{}.csv'.format(ticker)
    df_minute_bars = pd.read_csv(file_)
    # for the current stock's daily gappers, analyse the data and return it
    # (fades, ohlc, market periods, etc.)
    # NOTE(review): SESSION_DATA and display come from elsewhere (notebook
    # context); SESSION_DATA re-scans the full minute frame per daily row,
    # which is the main source of slowness discussed in this thread.
    for index, row in df.iterrows():
        session_data = SESSION_DATA(
            # pass in the minute data
            df_minute_bars,
            # pass in the daily data and ticker
            row['open'],
            row['high'],
            row['low'],
            row['close'],
            row['date'],
            ticker,
            row['previous close'],
            row['volume']).intraday_data
        df_intraday_analysis.append(session_data)

final_df = pd.concat(df_intraday_analysis, ignore_index=True)
display(final_df)
print(f'length of final_df is {len(final_df)}')
final_df.to_csv('mikeys-spreadsheet2222.csv', index=False)
And here's what my class looks like:
import pandas as pd
from datetime import datetime, time
from IPython.display import display
import math


class SESSION_DATA:
    """Per-(ticker, day) intraday analysis over minute bars.

    All of the work happens in __init__; the one-row result DataFrame is
    exposed as `self.intraday_data`.  Depends on helpers defined elsewhere
    in the project: GET_TIME_PERIOD_DATA, HIT_200_EMA, bigger_smaller,
    new_gapper, pop_over_10.
    """

    def __init__(self, minute_data, open, high, low, close, date, ticker, previous_close, volume):
        # NOTE: `open` shadows the builtin; kept to match existing callers.
        self.minute_data = minute_data
        self.date = date
        self.ticker = ticker
        self.intraday_data = []
        self.open = open
        self.high = high
        self.low = low
        self.close = close
        self.previous_close = previous_close
        self.volume = volume
        # Minute bars restricted to just this session's date.
        df = self.minute_data
        df_current_day = df[(df['date'] == self.date)]
        # NOTE(review): assigning into a filtered slice triggers
        # SettingWithCopyWarning, and the conversion reads the *full* df's
        # 'time' column (index alignment makes it line up) — confirm intent.
        df_current_day['time'] = pd.to_datetime(df['time']).dt.time
        self.after_hours_high = GET_TIME_PERIOD_DATA('after_hours', df_current_day).high
        self.after_hours_runner = bigger_smaller(self.after_hours_high, self.previous_close)
        # NOTE(review): the pre-market period is scanned twice here (once for
        # .high, once for .high_time); one call reused would halve that work.
        self.pre_market_high = GET_TIME_PERIOD_DATA('pre_market', df_current_day).high
        self.pre_market_high_time = GET_TIME_PERIOD_DATA('pre_market', df_current_day).high_time
        # NOTE(review): self.early_pre_market_runner is read here but never
        # assigned in this class — likely an AttributeError at runtime unless
        # it is set elsewhere; verify.
        self.new_gapper = new_gapper(self.after_hours_runner, self.early_pre_market_runner)
        self.spike = abs(self.high - self.open)
        # One-row summary of this session.
        df_intraday_data = pd.DataFrame({
            'date': self.date,
            'ticker': self.ticker,
            'open': self.open,
            'high': self.high,
            'low': self.low,
            'close': self.close,
            'prev close': self.previous_close,
            'volume': self.volume,
            'PM hi': self.pre_market_high,
            'PM hi time': self.pre_market_high_time,
            'PM 200 ema hit': HIT_200_EMA('pre_market', df_current_day).hit_200_ema,
            'New gapper': self.new_gapper,
            'Spike': self.spike,
            'Pop over 10%': pop_over_10(self.spike),
        }, index=[0])
        self.intraday_data = df_intraday_data
Is there a better way of achieving what I am doing, maybe without the use of iterrows or using something like numpy?

Thanks to CodeMonkey for pointing me in the right direction. I didn't test the speed difference but it's huge, so thank you. I will look into trying Zaero's suggestions in the future when I have time:
# get the dates for our gaps
import os.path
import glob
import numpy as np
import pandas as pd
from pathlib import Path
# import time


def SESSION_TEST(open, high, low, close, date, ticker, previous, volume, df):
    """Build a one-row summary frame for a single daily bar.

    `df` is the ticker's full minute-bar frame; HIT_200_EMA (defined
    elsewhere in the project) is evaluated on just this date's rows.
    NOTE: `open` shadows the builtin; kept to match the call sites below.
    """
    date = pd.to_datetime(date).strftime('%Y-%m-%d')
    current_day = df[df['date'] == date]
    session_data = pd.DataFrame({
        'date': date,
        'ticker': ticker,
        'open': open,
        'high': high,
        'low': low,
        'close': close,
        'prev close': previous,
        'volume': volume,
        'hit 200 ema': HIT_200_EMA('pre_market', current_day),
    }, index=[0])
    return session_data


folder = "daily_bars_filtered/*.csv"
df_intraday_analysis = []
# loop through the daily gappers (resuming from file 216 onwards)
for fname in glob.glob(folder)[216:]:
    ticker = Path(fname).stem
    df = pd.read_csv(fname, parse_dates=['date'])
    df['ticker'] = ticker
    print(f'getting {ticker}')
    # get the intraday bars data for the entire dates
    file_ = 'intraday_bars_gapped_new/{}.csv'.format(ticker)
    if os.path.exists(file_):
        print(f'the {file_} exists')
        df_minute_bars = pd.read_csv(file_, parse_dates=['date', 'datetime'])
        # to_dict(orient='records') is much faster to iterate than iterrows().
        for row in df.to_dict(orient='records'):
            session_data = SESSION_TEST(
                # pass in the daily data and ticker
                row['open'],
                row['high'],
                row['low'],
                row['close'],
                row['date'],
                ticker,
                row['previous close'],
                row['volume'],
                # pass in the minute data
                df_minute_bars)
            df_intraday_analysis.append(session_data)
        # NOTE(review): concat + to_csv run once per ticker, rewriting the
        # whole (growing) CSV each time — progressive save, but O(n^2) work.
        final_df = pd.concat(df_intraday_analysis, ignore_index=True)
        display(final_df)
        final_df.to_csv('mikeys-spreadsheet.csv', index=False)
    else:
        print(f'No such file or directory: {file_}')

Related

Unable to process large amount of data using for loop

I am downloading 2 years' worth of OHLC data for 10k symbols and writing it to a database. When I try to pull the entire list it crashes (but it doesn't if I only download 20% of it):
import config
from alpaca_trade_api.rest import REST, TimeFrame
import sqlite3
import pandas as pd
import datetime
from dateutil.relativedelta import relativedelta

# Two-year window, expressed as ISO strings in the exchange's timezone.
start_date = (datetime.datetime.now() - relativedelta(years=2)).date()
start_date = pd.Timestamp(start_date, tz='America/New_York').isoformat()
end_date = pd.Timestamp(datetime.datetime.now(), tz='America/New_York').isoformat()

conn = sqlite3.connect('allStockData.db')
api = REST(config.api_key_id, config.api_secret, base_url=config.base_url)

# All symbols to download, one dict per row.
origin_symbols = pd.read_sql_query("SELECT symbol, name from stock", conn)
df = origin_symbols
df_dict = df.to_dict('records')

startTime = datetime.datetime.now()
# NOTE(review): `api` is constructed twice; this second assignment is redundant.
api = REST(config.api_key_id, config.api_secret, base_url=config.base_url)

temp_data = []
for key in df_dict:
    symbol = key['symbol']
    print(f"downloading ${symbol}")
    # stock_id = key['id']
    barsets = api.get_bars_iter(symbol, TimeFrame.Day, start_date, end_date)
    barsets = list(barsets)
    # NOTE(review): one single-row DataFrame per bar — this per-object
    # overhead is what exhausts memory near 10k symbols (the crash described).
    for index, bar in enumerate(barsets):
        bars = pd.DataFrame({'date': bar.t.date(), 'symbol': symbol, 'open': bar.o, 'high': bar.h, 'low': bar.l, 'close': bar.c, 'volume': bar.v, 'vwap': bar.vw}, index=[0])
        temp_data.append(bars)

print("loop complete")
data = pd.concat(temp_data)
# write df back to sql, replacing the previous table
data.to_sql('daily_ohlc_init', if_exists='replace', con=conn, index=True)
endTime = datetime.datetime.now()
print(f'time elapsed to pull data was {endTime - startTime}')
To make it work I add this line after df_dict to limit symbols downloaded:
df_dict = df_dict[0:2000]
This will allow me to write to database but I need the entire dictionary (about 10k symbols). How do I write to the database without it crashing?
Since you mentioned that you are able to make it work for 2000 records of df_dict at a time, a possible simple approach could be:
api = REST(config.api_key_id, config.api_secret, base_url=config.base_url)

# Process df_dict in fixed-size chunks so the accumulated rows never exceed
# what one to_sql call (and memory) can handle.
num_records = len(df_dict)
chunk_size = 2000
# Ceiling division: one extra pass when there is a partial final chunk.
num_passes = num_records // chunk_size + int(num_records % chunk_size != 0)
for i in range(num_passes):
    start = i * chunk_size
    end = min((i + 1) * chunk_size, num_records)
    df_chunk = df_dict[start: end]
    temp_data = []
    for key in df_chunk:
        symbol = key['symbol']
        print(f"downloading ${symbol}")
        barsets = api.get_bars_iter(symbol, TimeFrame.Day, start_date, end_date)
        barsets = list(barsets)
        # Plain row lists instead of one-row DataFrames keep memory low.
        for index, bar in enumerate(barsets):
            bars = [bar.t.date(), symbol, bar.o, bar.h, bar.l, bar.c, bar.v, bar.vw]
            temp_data.append(bars)
    # should be a bit more efficient to create a dataframe just once
    columns = ['date', 'symbol', 'open', 'high', 'low', 'close', 'volume', 'vwap']
    data = pd.DataFrame(temp_data, columns=columns)
    # should delete previous table when writing first chunk, then start appending from next passes through df_dict
    data.to_sql('daily_ohlc_init', if_exists='replace' if i == 0 else 'append', con=conn, index=True)
    print(f"Internal loop finished processing records {start} to {end} out of {num_records}.")
endTime = datetime.datetime.now()
print(f'time elapsed to pull data was {endTime - startTime}')

Pandas how to search one df for a certain date and return that data

I have two data frames and I am trying to search each row by date in the User.csv file, find the corresponding date in the Raven.csv file, and then return the Price from df1 and the date and amount from df2.
This is working but my Price is returning a value like this [[0.11465]], is there a way to remove these brackets or a better way to do this?
import pandas as pd

# Load the price history (Raven.csv) and the user's payout history (User.csv).
df1 = pd.read_csv('Raven.csv',)
df2 = pd.read_csv('User.csv')
df1 = df1.reset_index(drop=False)
df1.columns = ['index', 'Date', 'Price']

# Normalise both date columns to plain dates so they compare equal.
df2['Timestamp'] = pd.to_datetime(df2['Timestamp'], format="%Y-%m-%d %H:%M:%S").dt.date
df1['Date'] = pd.to_datetime(df1['Date'], format="%Y-%m-%d").dt.date

# fix: a vectorised left merge replaces the manual `Looper` counter loop and
# yields scalar prices instead of the nested [[0.11465]] values the original
# produced via df1['Price'].values[<DataFrame>].
merged = df2.merge(df1[['Date', 'Price']],
                   left_on='Timestamp', right_on='Date', how='left')

out = pd.DataFrame({
    'Date': merged['Timestamp'],
    'Price': merged['Price'],
    'Payout': merged['Amount'],
    'Total Value': merged['Price'] * merged['Amount'],
})
out.to_csv('out.csv')
You can do indexing to get the value:
# Demo: unwrap a doubly-nested single-element list by indexing twice.
nested = [[0.11465]]
value = nested[0][0]
print(value)
You get:
0.11465
I hope this is what you need.

How to pass a variable as an argument of another function in python

I have a list (chart_list) and I want to call them one by one and plot the chart but I face an error. How can I deal with this problem? I know that they are strings but I don't know how to give it to the tickerDf.
import streamlit as st
import yfinance as yf
import pandas as pd
import datetime

cols = st.columns(2)
# define the ticker symbol
tickerSymbol = cols[0].text_input("Symbol:", 'GOOG')
st.markdown(f'Shown are the **stock closing** price and **volume** of **{tickerSymbol}**')
# get data on this ticker
tickerData = yf.Ticker(tickerSymbol)
period_list = ['1d', '5d']
selected_period = cols[0].selectbox("Period:", period_list)
interval_list = ['1m', '2m', '5m', '15m', '30m', '60m', '90m', '1h', '1d', '5d', '1wk', '1mo', '3mo']
selected_interval = cols[1].selectbox("Interval", interval_list)
today = datetime.date.today()
yesterday = today + datetime.timedelta(days=-10)  # i.e. ten days ago
start_date = cols[0].date_input('Start date', yesterday)
end_date = cols[1].date_input('End date', today)
if start_date > end_date:
    st.error("Error: End date must fall after start date")
# get the historical prices for this ticker
tickerDf = tickerData.history(interval=selected_interval, start=start_date, end=end_date)
# Open High Low Close Volume Dividends Stock Splits
chart_list = ['Open', 'High', 'Low', 'Close', 'Volume']
selected_charts = st.multiselect("Charts", chart_list)
if st.button("Show"):
    for chart in chart_list:
        # BUG (the question's error): `tickerDf.chart` looks up a column
        # literally named "chart"; it should be `tickerDf[chart]`.  Note also
        # the loop iterates chart_list rather than the user's selected_charts.
        st.line_chart(tickerDf.chart)
        st.write(f"## {chart}")
You can't index into a dataframe like this (st.line_chart(tickerDf.chart)), as this is a literal specification of the column name.
Try st.line_chart(tickerDf[chart]) instead
The problem is how you are accessing the dataframe in this loop:
for chart in chart_list:
st.line_chart(tickerDf.chart)
st.write(f"## {chart}")
Change it to this:
for chart in selected_charts:
st.write(f"## {chart}")
st.line_chart(tickerDf[chart])
Working code:
import streamlit as st
import yfinance as yf
import pandas as pd
import datetime

# Two-column layout; widgets below are placed into one column or the other.
cols = st.columns(2)
# define the ticker symbol
tickerSymbol = cols[0].text_input("Symbol:", 'GOOG')
st.markdown(f'Shown are the **stock closing** price and **volume** of **{tickerSymbol}**')
# get data on this ticker
tickerData = yf.Ticker(tickerSymbol)
period_list = ['1d', '5d']
selected_period = cols[0].selectbox("Period:", period_list)
interval_list = ['1m', '2m', '5m', '15m', '30m', '60m', '90m', '1h', '1d', '5d', '1wk', '1mo', '3mo']
selected_interval = cols[1].selectbox("Interval", interval_list)
today = datetime.date.today()
yesterday = today + datetime.timedelta(days=-10)  # i.e. ten days ago
start_date = cols[0].date_input('Start date', yesterday)
end_date = cols[1].date_input('End date', today)
if start_date > end_date:
    st.error("Error: End date must fall after start date")
# get the historical prices for this ticker
tickerDf = tickerData.history(interval=selected_interval, start=start_date, end=end_date)
# Open High Low Close Volume Dividends Stock Splits
chart_list = ['Open', 'High', 'Low', 'Close', 'Volume']
selected_charts = st.multiselect("Charts", chart_list)
if st.button("Show"):
    # Fixed version: iterate only the user's selections and index the
    # DataFrame with tickerDf[chart] (column-by-name), not tickerDf.chart.
    for chart in selected_charts:
        st.write(f"## {chart}")
        st.line_chart(tickerDf[chart])
Output example:

How do I get all the prices history with binance API for a crypto using Python?

I've been using this script to get the prices from some cryptocurrencies using Binance API and this script:
https://steemit.com/python/#marketstack/how-to-download-historical-price-data-from-binance-with-python
The problem is that with this script I cannot control the date range: for example, I want to choose the period range between Dec. 2015 and Dec. 2020, or I want the DAILY PRICES from the first day trading for any crypto ...etc.
So I share with you the code I'm using (copied from the steemit code and modified a little bit)
How can I do it?
# https://steemit.com/python/#marketstack/how-to-download-historical-price-data-from-binance-with-python###
# https://steemit.com/python/#marketstack/how-to-download-historical-price-data-from-binance-with-python###
import requests
import json
import pandas as pd
import numpy as np
import datetime as dt

# Kline granularity, asked interactively (e.g. '1m', '1h', '1d').
frequency = input("Please enter the frequency (1m/5m/30m/.../1h/6h/1d/ : ")


def get_bars(symbol, interval=frequency):
    """Fetch the default (most recent) kline window for `symbol` from Binance.

    NOTE(review): /api/v1/klines is the legacy endpoint (current is /api/v3),
    and no startTime/endTime is sent, so Binance returns only its default
    most-recent window — this is why the date range cannot be controlled here.
    """
    root_url = 'https://api.binance.com/api/v1/klines'
    url = root_url + '?symbol=' + symbol + '&interval=' + interval
    data = json.loads(requests.get(url).text)
    df = pd.DataFrame(data)
    df.columns = ['open_time',
                  'o', 'h', 'l', 'c', 'v',
                  'close_time', 'qav', 'num_trades',
                  'taker_base_vol', 'taker_quote_vol', 'ignore']
    # Index by close time (epoch milliseconds -> local datetime).
    df.index = [dt.datetime.fromtimestamp(x / 1000.0) for x in df.close_time]
    return df


btcusdt = get_bars('BTCUSDT')
ethusdt = get_bars('ETHUSDT')
df0 = pd.DataFrame(btcusdt)
df0.to_csv('_btcusdt.csv')
df1 = pd.DataFrame(ethusdt)
df1.to_csv('_ethusdt.csv')
Can anyone help me to optimize it?
I am using this out of the binance documentation : https://python-binance.readthedocs.io/en/latest/binance.html?highlight=get_historical_klines#binance.client.Client.get_historical_klines
import os
from binance.client import Client
import pandas as pd
import datetime, time


def GetHistoricalData(self, howLong):
    """Download the last `howLong` days of 1-minute BNBBTC klines into self.df.

    NOTE(review): this was cut out of a class — `self.client` must already be
    a binance `Client`, and `Constants.DateTimeFormat` (used below) is not
    defined anywhere in this snippet; both must exist in the original class.
    """
    self.howLong = howLong
    # Calculate the timestamps for the binance api function
    self.untilThisDate = datetime.datetime.now()
    self.sinceThisDate = self.untilThisDate - datetime.timedelta(days = self.howLong)
    # Execute the query from binance - timestamps must be converted to strings !
    self.candle = self.client.get_historical_klines("BNBBTC", Client.KLINE_INTERVAL_1MINUTE, str(self.sinceThisDate), str(self.untilThisDate))
    # Create a dataframe to label all the columns returned by binance so we work with them later.
    self.df = pd.DataFrame(self.candle, columns=['dateTime', 'open', 'high', 'low', 'close', 'volume', 'closeTime', 'quoteAssetVolume', 'numberOfTrades', 'takerBuyBaseVol', 'takerBuyQuoteVol', 'ignore'])
    # as timestamp is returned in ms, let us convert this back to proper timestamps.
    self.df.dateTime = pd.to_datetime(self.df.dateTime, unit='ms').dt.strftime(Constants.DateTimeFormat)
    self.df.set_index('dateTime', inplace=True)
    # Get rid of columns we do not need
    self.df = self.df.drop(['closeTime', 'quoteAssetVolume', 'numberOfTrades', 'takerBuyBaseVol','takerBuyQuoteVol', 'ignore'], axis=1)
    print(self.df)
I do hope this helps someone.
(Please note this method is cut out of a class I have, so you may get rid of all of the self-s) , and you need to have your client set up before by
client = Client(api_key, api_secret)
Any improvements are of course welcome !
This is a function that I used.
Start and end are dates in Unix timestamp format. Interval is graph interval.
And keep in mind Binance did not exist in Dec 2015 :-)
def get_klines_iter(symbol, interval, start, end, limit=5000):
    """Fetch Binance klines backwards from `end` until `start` is reached.

    start/end are epoch-millisecond timestamps; each pass requests up to
    `limit` candles ending at the oldest open time seen so far and prepends
    them to the result.  Returns one DataFrame, oldest row first.

    NOTE(review): Binance caps a single klines request at 1000 rows, so the
    default limit=5000 is silently clamped server-side.
    """
    df = pd.DataFrame()
    startDate = end
    while startDate > start:
        # fix: the original interpolated an undefined name `iteration`
        # (NameError on first call); the request size is the `limit` argument.
        url = 'https://api.binance.com/api/v3/klines?symbol=' + \
            symbol + '&interval=' + interval + '&limit=' + str(limit)
        if startDate is not None:
            url += '&endTime=' + str(startDate)
        df2 = pd.read_json(url)
        df2.columns = ['Opentime', 'Open', 'High', 'Low', 'Close', 'Volume', 'Closetime', 'Quote asset volume', 'Number of trades','Taker by base', 'Taker buy quote', 'Ignore']
        # Prepend the older page so the frame stays chronologically ordered.
        df = pd.concat([df2, df], axis=0, ignore_index=True, keys=None)
        startDate = df.Opentime[0]
    df.reset_index(drop=True, inplace=True)
    return df
from datetime import datetime
import pandas as pd
import requests
from typing import *
import time
class BinanceClient:
    """Minimal REST client for Binance spot or USD-M futures market data."""

    def __init__(self, futures=False):
        self.exchange = "BINANCE"
        self.futures = futures
        # Futures and spot use different hosts and endpoint prefixes.
        if self.futures:
            self._base_url = "https://fapi.binance.com"
        else:
            self._base_url = "https://api.binance.com"
        self.symbols = self._get_symbols()

    def _make_request(self, endpoint: str, query_parameters: Dict):
        """GET `endpoint` with `query_parameters`; return parsed JSON, or None on any failure."""
        try:
            response = requests.get(self._base_url + endpoint, params=query_parameters)
        except Exception as e:
            # NOTE(review): %s placeholders are passed to print(), not a
            # logger, so they are printed literally rather than interpolated.
            print("Connection error while making request to %s: %s", endpoint, e)
            return None
        if response.status_code == 200:
            return response.json()
        else:
            print("Error while making request to %s: %s (status code = %s)",
                  endpoint, response.json(), response.status_code)
            return None

    def _get_symbols(self) -> List[str]:
        """Return every symbol listed on the exchange (spot or futures)."""
        params = dict()
        endpoint = "/fapi/v1/exchangeInfo" if self.futures else "/api/v3/exchangeInfo"
        data = self._make_request(endpoint, params)
        symbols = [x["symbol"] for x in data["symbols"]]
        return symbols

    def get_historical_data(self, symbol: str, interval: Optional[str] = "1m", start_time: Optional[int] = None, end_time: Optional[int] = None, limit: Optional[int] = 1500):
        """Fetch klines; return a list of (open_time_ms, o, h, l, c, v) float tuples, or None on error."""
        params = dict()
        params["symbol"] = symbol
        params["interval"] = interval
        params["limit"] = limit
        if start_time is not None:
            params["startTime"] = start_time
        if end_time is not None:
            params["endTime"] = end_time
        endpoint = "/fapi/v1/klines" if self.futures else "/api/v3/klines"
        raw_candles = self._make_request(endpoint, params)
        candles = []
        if raw_candles is not None:
            # Keep only the first six fields of each raw candle, as floats.
            for c in raw_candles:
                candles.append((float(c[0]), float(c[1]), float(c[2]), float(c[3]), float(c[4]), float(c[5]),))
            return candles
        else:
            return None
def ms_to_dt_utc(ms: int) -> datetime:
    """Convert an epoch timestamp in milliseconds to a naive UTC datetime."""
    seconds = ms / 1000
    return datetime.utcfromtimestamp(seconds)
def ms_to_dt_local(ms: int) -> datetime:
    """Convert an epoch timestamp in milliseconds to a naive local-time datetime."""
    seconds = ms / 1000
    return datetime.fromtimestamp(seconds)
def GetDataFrame(data):
    """Build an OHLCV frame indexed by local-time Timestamp from raw kline tuples.

    `data` is a sequence of (open_time_ms, open, high, low, close, volume)
    tuples; the result has Date/Time string columns plus the OHLCV columns.
    """
    frame = pd.DataFrame(data, columns=['Timestamp', "Open", "High", "Low", "Close", "Volume"])
    # Epoch milliseconds -> naive local datetimes, row by row.
    frame["Timestamp"] = [ms_to_dt_local(ms) for ms in frame["Timestamp"]]
    frame['Date'] = frame["Timestamp"].dt.strftime("%d/%m/%Y")
    frame['Time'] = frame["Timestamp"].dt.strftime("%H:%M:%S")
    ordered = ["Date", "Time", "Open", "High", "Low", "Close", "Volume"]
    return frame.set_index('Timestamp').reindex(columns=ordered)
def GetHistoricalData(client, symbol, start_time, end_time, limit=1500):
    """Page through klines from start_time to end_time (both epoch ms).

    Repeatedly calls client.get_historical_data, advancing start_time one
    second past the last candle received, and sleeps ~1.1 s between requests
    to stay under Binance's rate limits.  Returns the concatenated candle list.
    """
    collection = []
    while start_time < end_time:
        data = client.get_historical_data(symbol, start_time=start_time, end_time=end_time, limit=limit)
        print(client.exchange + " " + symbol + " : Collected " + str(len(data)) + " initial data from "+ str(ms_to_dt_local(data[0][0])) +" to " + str(ms_to_dt_local(data[-1][0])))
        # Next page starts one second after the last open time received.
        start_time = int(data[-1][0] + 1000)
        collection +=data
        time.sleep(1.1)
    return collection
# Example usage: one day of 1-minute BTCUSDT candles as a DataFrame.
client = BinanceClient(futures=False)
symbol = "BTCUSDT"
# NOTE(review): `interval` is never passed to GetHistoricalData, which
# therefore uses the client's default of "1m".
interval = "1m"
fromDate = int(datetime.strptime('2021-11-15', '%Y-%m-%d').timestamp() * 1000)
toDate = int(datetime.strptime('2021-11-16', '%Y-%m-%d').timestamp() * 1000)
data = GetHistoricalData(client, symbol, fromDate, toDate)
df = GetDataFrame(data)
df
based on Mike Malyi and isnvi23h4's answer:
Please use python >= 3.7, the code does not need to install any dependencies
import pandas as pd
from datetime import datetime, timezone, timedelta
import calendar


def get_klines_iter(symbol, interval, start, end = None, limit=1000):
    # start and end must be isoformat YYYY-MM-DD
    # We are using utc time zone
    # the maximum records is 1000 per each Binance API call
    """Download klines for `symbol` between two UTC ISO dates, 1000 per request.

    `end` defaults to "now" (UTC).  The accumulated Date/Time/OHLCV frame is
    written to '0y_eth_only17andnew.csv' as tab-separated values.
    """
    df = pd.DataFrame()
    if start is None:
        print('start time must not be None')
        return
    start = calendar.timegm(datetime.fromisoformat(start).timetuple()) * 1000
    if end is None:
        dt = datetime.now(timezone.utc)
        utc_time = dt.replace(tzinfo=timezone.utc)
        end = int(utc_time.timestamp()) * 1000
        # fix: the original had a stray `return` here, so calling with
        # end=None (the documented default) always aborted before fetching.
    else:
        end = calendar.timegm(datetime.fromisoformat(end).timetuple()) * 1000
    last_time = None
    # Keep requesting forward from the last candle seen until we pass `end`.
    while len(df) == 0 or (last_time is not None and last_time < end):
        url = 'https://api.binance.com/api/v3/klines?symbol=' + \
            symbol + '&interval=' + interval + '&limit=1000'
        if(len(df) == 0):
            url += '&startTime=' + str(start)
        else:
            url += '&startTime=' + str(last_time)
        url += '&endTime=' + str(end)
        df2 = pd.read_json(url)
        df2.columns = ['Opentime', 'Open', 'High', 'Low', 'Close', 'Volume', 'Closetime',
                       'Quote asset volume', 'Number of trades', 'Taker by base', 'Taker buy quote', 'Ignore']
        dftmp = pd.DataFrame()
        dftmp = pd.concat([df2, dftmp], axis=0, ignore_index=True, keys=None)
        dftmp.Opentime = pd.to_datetime(dftmp.Opentime, unit='ms')
        dftmp['Date'] = dftmp.Opentime.dt.strftime("%d/%m/%Y")
        dftmp['Time'] = dftmp.Opentime.dt.strftime("%H:%M:%S")
        dftmp = dftmp.drop(['Quote asset volume', 'Closetime', 'Opentime',
                            'Number of trades', 'Taker by base', 'Taker buy quote', 'Ignore'], axis=1)
        column_names = ["Date", "Time", "Open", "High", "Low", "Close", "Volume"]
        dftmp.reset_index(drop=True, inplace=True)
        dftmp = dftmp.reindex(columns=column_names)
        # Recover the last open time in epoch ms to drive the next request.
        string_dt = str(dftmp['Date'][len(dftmp) - 1]) + 'T' + str(dftmp['Time'][len(dftmp) - 1]) + '.000Z'
        utc_last_time = datetime.strptime(string_dt, "%d/%m/%YT%H:%M:%S.%fZ")
        last_time = (utc_last_time - datetime(1970, 1, 1)) // timedelta(milliseconds=1)
        df = pd.concat([df, dftmp], axis=0, ignore_index=True, keys=None)
    df.to_csv('0y_eth_only17andnew.csv', sep='\t', index=False)


get_klines_iter('ETHBUSD', '30m', '2022-01-01', '2022-02-21')
I do it like this:
def get_binance_data(api_key, pair, countdown='open', interval='4h', start='1 Jan 2018', end=None):
    """Download historical klines for `pair` and return a float OHLCV frame.

    Requires `Client` from python-binance (`from binance.client import
    Client`), which is not imported in this snippet.  `interval` is looked up
    in a whitelist and falls back to '4h' for unknown values.
    NOTE(review): the `countdown` parameter is never used.
    """
    client = Client(api_key=api_key)
    intervals = {
        '15m': Client.KLINE_INTERVAL_15MINUTE,
        '1h': Client.KLINE_INTERVAL_1HOUR,
        '4h': Client.KLINE_INTERVAL_4HOUR,
        '1d': Client.KLINE_INTERVAL_1DAY
    }
    interval = intervals.get(interval, '4h')
    print(f'Historical interval {interval}')
    klines = client.get_historical_klines(symbol=pair, interval=interval, start_str=start, end_str=end)
    data = pd.DataFrame(klines)
    data.columns = ['open_time','open', 'high', 'low', 'close', 'volume','close_time', 'qav','num_trades','taker_base_vol','taker_quote_vol', 'ignore']
    # Human-readable open-time strings as the index (epoch ms -> UTC string).
    data.index = [pd.to_datetime(x, unit='ms').strftime('%Y-%m-%d %H:%M:%S') for x in data.open_time]
    usecols=['open', 'high', 'low', 'close', 'volume']
    data = data[usecols]
    data = data.astype('float')
    return data


api_key = 'хххх...xxx' # use your api-key
symbol = 'ETHUSDT'
eth = get_binance_data(api_key, symbol)
eth.head()
Output:
Historical interval 4h
open high low close volume
2018-01-01 00:00:00 733.01 737.99 716.80 734.50 8739.23361
2018-01-01 04:00:00 734.99 763.55 730.01 751.99 9492.34734
2018-01-01 08:00:00 751.77 759.00 730.58 741.01 8939.36851
2018-01-01 12:00:00 741.01 752.27 724.15 748.80 11284.08664
2018-01-01 16:00:00 748.27 749.98 733.00 746.23 7757.00362
import requests

# One-shot request for daily ETH/EUR klines.  With no startTime/endTime,
# Binance returns its default most-recent window of candles.
market = 'ETHEUR'
tick_interval = '1d'
url = 'https://api.binance.com/api/v3/klines?symbol='+market+'&interval='+tick_interval
data = requests.get(url).json()
print(data)

dataframe append new series column with data

I have a Panda DataFrame structure and I want to add another column to it, but I can't do it with append, add or insert.
I'm trying to replicate the portfolio data with the Panda's built-in function, because this script doesn't give me correct data if the period that I request is lower than ~ 1,5 years while data must be obtained even for two days if I want. So here's the script that I want to rewrite:
import QSTK.qstkutil.qsdateutil as du
import QSTK.qstkutil.tsutil as tsu
import QSTK.qstkutil.DataAccess as da
import datetime as dt
import matplotlib.pyplot as plt
import pandas as pd

# Symbols and NYSE trading-day timestamps for the requested window.
ls_symbols = ["AAPL", "GLD", "GOOG", "$SPX", "XOM"]
dt_start = dt.datetime(2006, 1, 1)
dt_end = dt.datetime(2010, 12, 31)
dt_timeofday = dt.timedelta(hours=16)  # 16:00 = NYSE close
ldt_timestamps = du.getNYSEdays(dt_start, dt_end, dt_timeofday)

c_dataobj = da.DataAccess('Yahoo')
ls_keys = ['open', 'high', 'low', 'close', 'volume', 'actual_close']
ldf_data = c_dataobj.get_data(ldt_timestamps, ls_symbols, ls_keys)
# fix: stray markdown bold markers (**...**) around this line made it
# invalid Python; this maps each field name to its DataFrame.
d_data = dict(zip(ls_keys, ldf_data))
d_data = dict(zip(ls_keys, ldf_data)) is what I want to replicate because it doesn't fetch the data that I want, but I need to figure out a way to append a new column to my dict. Here is my script:
from pandas.io.data import DataReader, DataFrame
import QSTK.qstkutil.qsdateutil as du
import QSTK.qstkutil.DataAccess as da
import datetime as dt


# NOTE(review): this is Python 2 code (print statements), and pandas.io.data
# was removed from pandas long ago (superseded by pandas-datareader).
def get_historical_data(symbol, source, date_from, date_to):
    """Fetch per-symbol data via DataReader into a dict of frames keyed by field.

    Only the 'Open' frame is ever populated for the first symbol; the else
    branch (adding further symbols' Open columns) was never finished -- that
    unfinished step is the subject of this question.
    """
    global data_validator
    symbol_data = {}
    ls_keys = ['Open', 'High', 'Low', 'Close', 'Volume', 'Adj Close']
    # One empty frame per OHLCV field.
    for key in ls_keys:
        symbol_data[key] = DataFrame({})
    dataframe_open = DataFrame({})
    for item in symbol:
        print 'Fetching data for:', item
        current_data = DataReader(str(item), source, date_from, date_to)
        dataframe_open = {item : current_data['Open']}
        if len(symbol_data['Open'].columns) == 0:
            symbol_data['Open'] = DataFrame(dataframe_open)
        else:
            # i want to add the new column here but can't seem to do this.
            #symbol_data['Open'].loc[:item] = DataFrame(dataframe_open)
            pass
    return symbol_data
P.S. I call the func with these parameters for testing purposes:
test = get_historical_data(['SPY', 'DIA'], 'yahoo', datetime(2015,1,1), datetime(2015,1,31))
Does the following help? Have not tested yet, but should work in principle... Just put the data in arrays of equal length and construct the data frame from that.
def get_historical_data(symbols=[], source=None, date_from=None, date_to=None):
    """Fetch frames via (legacy) pandas DataReader and combine into one DataFrame.

    fix: the DataReader line below was missing its closing parenthesis (a
    SyntaxError as posted); the large triple-quoted block of dead,
    commented-out code and the unused `global`/`symbol_data` locals were
    removed.

    NOTE(review): posted untested by its author -- as written it passes the
    field names ('Open', 'High', ...) to DataReader as if they were ticker
    symbols and never consults `symbols`; confirm the intended call pattern
    against DataReader's signature before relying on this.
    """
    ls_keys = ['Open', 'High', 'Low', 'Close', 'Volume', 'Adj Close']
    data = []
    for item in ls_keys:
        data.append(DataReader(str(item), source, date_from, date_to))
    symbol_dataframe = DataFrame(data=data, columns=ls_keys)
    return symbol_dataframe

Categories