Unable to process large amount of data using for loop - python

I am downloading two years' worth of OHLC data for 10k symbols and writing it to a database. When I try to pull the entire list it crashes (but it doesn't if I download only 20% of it):
import config
from alpaca_trade_api.rest import REST, TimeFrame
import sqlite3
import pandas as pd
import datetime
from dateutil.relativedelta import relativedelta
# Query window: the last two years up to "now", as ISO-8601 strings in New York time.
two_years_ago = (datetime.datetime.now() - relativedelta(years=2)).date()
start_date = pd.Timestamp(two_years_ago, tz='America/New_York').isoformat()
end_date = pd.Timestamp(datetime.datetime.now(), tz='America/New_York').isoformat()

conn = sqlite3.connect('allStockData.db')
api = REST(config.api_key_id, config.api_secret, base_url=config.base_url)

# Every symbol/name pair from the `stock` table, as a list of dicts.
origin_symbols = pd.read_sql_query("SELECT symbol, name from stock", conn)
df = origin_symbols
df_dict = df.to_dict('records')

startTime = datetime.datetime.now()
# The client is constructed a second time here, exactly as in the original listing.
api = REST(config.api_key_id, config.api_secret, base_url=config.base_url)

temp_data = []
for key in df_dict:
    symbol = key['symbol']
    print(f"downloading ${symbol}")
    # stock_id = key['id']
    barsets = list(api.get_bars_iter(symbol, TimeFrame.Day, start_date, end_date))
    for index, bar in enumerate(barsets):
        # One single-row DataFrame per daily bar; every one of them stays in
        # memory until the final concat below.
        bar_fields = {
            'date': bar.t.date(),
            'symbol': symbol,
            'open': bar.o,
            'high': bar.h,
            'low': bar.l,
            'close': bar.c,
            'volume': bar.v,
            'vwap': bar.vw,
        }
        bars = pd.DataFrame(bar_fields, index=[0])
        temp_data.append(bars)
print("loop complete")

data = pd.concat(temp_data)
# write df back to sql, replacing the previous table
data.to_sql('daily_ohlc_init', if_exists='replace', con=conn, index=True)

endTime = datetime.datetime.now()
print(f'time elapsed to pull data was {endTime - startTime}')
To make it work I add this line after df_dict to limit symbols downloaded:
# Keep only the first 2000 symbol records so that fewer symbols are downloaded.
df_dict = df_dict[0:2000]
This will allow me to write to database but I need the entire dictionary (about 10k symbols). How do I write to the database without it crashing?

Since you mentioned that you are able to make it work for 2000 records of df_dict at a time, a possible simple approach could be:
api = REST(config.api_key_id, config.api_secret, base_url=config.base_url)

# Work through the symbol list in fixed-size chunks so that only one chunk's
# bars are held in memory at a time; each chunk is written to SQLite before
# the next one is downloaded.
chunk_size = 2000
num_records = len(df_dict)
columns = ['date', 'symbol', 'open', 'high', 'low', 'close', 'volume', 'vwap']

for i, start in enumerate(range(0, num_records, chunk_size)):
    end = min(start + chunk_size, num_records)
    temp_data = []
    for key in df_dict[start:end]:
        symbol = key['symbol']
        print(f"downloading ${symbol}")
        # Consume the bar iterator directly: plain row lists are much cheaper
        # than building one DataFrame per bar, and no intermediate list of
        # bar objects is kept around.
        temp_data.extend(
            [bar.t.date(), symbol, bar.o, bar.h, bar.l, bar.c, bar.v, bar.vw]
            for bar in api.get_bars_iter(symbol, TimeFrame.Day, start_date, end_date)
        )

    # Build the DataFrame once per chunk.
    data = pd.DataFrame(temp_data, columns=columns)
    # The first chunk replaces any previous table; later chunks append to it.
    # NOTE: with index=True the stored index restarts at 0 for every chunk.
    data.to_sql('daily_ohlc_init', if_exists='replace' if i == 0 else 'append', con=conn, index=True)
    print(f"Internal loop finished processing records {start} to {end} out of {num_records}.")

endTime = datetime.datetime.now()
print(f'time elapsed to pull data was {endTime - startTime}')

Related

Creating a class from df.iterrows() is painfully slow

I'm wondering if I am doing this the correct way - I am new to Python, and tried to figure this out as best I could, but now I've almost completed my project, part of it is painfully slow.
I have pulled down daily OHLC bars, filtered through them to find the gappers, and then created a class which goes over the minute data for those daily gappers and returns important information which I use later for my backtests, e.g. whether we've hit the 200 EMA in the pre-market.
Here's what my code looks like:
# get the dates for our gaps
import os.path
import glob
import numpy as np
from pathlib import Path
folder = "daily_bars_filtered/*.csv"
df_gapper_list = []
df_intraday_analysis = []

# Walk the first 13 filtered daily-gapper files.
for fname in glob.glob(folder)[:13]:
    ticker = Path(fname).stem
    df = pd.read_csv(fname)
    df['ticker'] = ticker
    df_gapper_list.append(df)
    print(f'downloading {ticker}')

    # Minute bars for this ticker, covering all of its dates.
    file_ = 'intraday_bars_gapped_new/{}.csv'.format(ticker)
    df_minute_bars = pd.read_csv(file_)

    # Analyse every daily gapper row of this ticker against the minute data
    # and collect the resulting one-row frame.
    for index, row in df.iterrows():
        session = SESSION_DATA(
            df_minute_bars,          # minute data
            row['open'],             # daily data and ticker
            row['high'],
            row['low'],
            row['close'],
            row['date'],
            ticker,
            row['previous close'],
            row['volume'],
        )
        session_data = session.intraday_data
        df_intraday_analysis.append(session_data)

final_df = pd.concat(df_intraday_analysis, ignore_index=True)
display(final_df)
print(f'length of final_df is {len(final_df)}')
final_df.to_csv('mikeys-spreadsheet2222.csv', index=False)
And here's what my class looks like:
import pandas as pd
from datetime import datetime, time
from IPython.display import display
import math
class SESSION_DATA:
    """Summarise one ticker's intraday session for a single daily bar.

    The constructor filters the minute bars down to ``date``, derives a few
    session statistics through helpers defined elsewhere
    (``GET_TIME_PERIOD_DATA``, ``HIT_200_EMA``, ``bigger_smaller``,
    ``new_gapper``, ``pop_over_10``) and stores the result as a one-row
    DataFrame in ``self.intraday_data``.

    Args:
        minute_data: DataFrame of minute bars; must have ``date`` and ``time``
            columns.
        open, high, low, close: daily OHLC values for ``date``.
        date: value compared for equality against ``minute_data['date']``.
        ticker: ticker symbol, copied into the output row.
        previous_close: previous session's close.
        volume: daily volume.
    """

    def __init__(self, minute_data, open, high, low, close, date, ticker, previous_close, volume):
        self.minute_data = minute_data
        self.date = date
        self.ticker = ticker
        self.intraday_data = []
        self.open = open
        self.high = high
        self.low = low
        self.close = close
        self.previous_close = previous_close
        self.volume = volume

        # Take an explicit copy of the day's rows: assigning a column to a
        # filtered slice triggers pandas' SettingWithCopyWarning and is not
        # guaranteed to take effect.
        df_current_day = minute_data[minute_data['date'] == self.date].copy()
        # Parse only this day's times rather than the whole minute frame; the
        # full-frame parse was repeated for every instance created.
        df_current_day['time'] = pd.to_datetime(df_current_day['time']).dt.time

        self.after_hours_high = GET_TIME_PERIOD_DATA('after_hours', df_current_day).high
        self.after_hours_runner = bigger_smaller(self.after_hours_high, self.previous_close)

        # Build the pre-market helper once and read both attributes from it.
        pre_market = GET_TIME_PERIOD_DATA('pre_market', df_current_day)
        self.pre_market_high = pre_market.high
        self.pre_market_high_time = pre_market.high_time

        # NOTE(review): self.early_pre_market_runner is not assigned anywhere
        # in the visible code; presumably it was set in a part of the class
        # omitted from this excerpt. Confirm before running.
        self.new_gapper = new_gapper(self.after_hours_runner, self.early_pre_market_runner)
        self.spike = abs(self.high - self.open)

        df_intraday_data = pd.DataFrame({
            'date': self.date,
            'ticker': self.ticker,
            'open': self.open,
            'high': self.high,
            'low': self.low,
            'close': self.close,
            'prev close': self.previous_close,
            'volume': self.volume,
            'PM hi': self.pre_market_high,
            'PM hi time': self.pre_market_high_time,
            'PM 200 ema hit': HIT_200_EMA('pre_market', df_current_day).hit_200_ema,
            'New gapper': self.new_gapper,
            'Spike': self.spike,
            'Pop over 10%': pop_over_10(self.spike),
        }, index=[0])
        self.intraday_data = df_intraday_data
Is there a better way of achieving what I am doing, maybe without the use of iterrows or using something like numpy?
Thanks to CodeMonkey for pointing me in the right direction. I didn't test the speed difference but it's huge, so thank you. I will look into trying Zaero's suggestions in the future when I have time:
# get the dates for our gaps
import os.path
import glob
import numpy as np
import pandas as pd
from pathlib import Path
# import time
def SESSION_TEST(open, high, low, close, date, ticker, previous, volume, df):
    """Return a one-row DataFrame describing a single daily bar.

    ``date`` is normalised to a ``YYYY-MM-DD`` string, the rows of ``df``
    whose ``date`` column equals that string are selected, and the selection
    is passed to ``HIT_200_EMA('pre_market', ...)`` (defined elsewhere) to
    fill the ``hit 200 ema`` column. The remaining columns are copied from
    the arguments.
    """
    day_key = pd.to_datetime(date).strftime('%Y-%m-%d')
    day_rows = df[df['date'] == day_key]
    summary = {
        'date': day_key,
        'ticker': ticker,
        'open': open,
        'high': high,
        'low': low,
        'close': close,
        'prev close': previous,
        'volume': volume,
        'hit 200 ema': HIT_200_EMA('pre_market', day_rows),
    }
    return pd.DataFrame(summary, index=[0])
folder = "daily_bars_filtered/*.csv"
df_intraday_analysis = []

# Walk the daily-gapper files, starting from the 217th match.
for fname in glob.glob(folder)[216:]:
    ticker = Path(fname).stem
    df = pd.read_csv(fname, parse_dates=['date'])
    df['ticker'] = ticker
    print(f'getting {ticker}')

    # Minute bars for this ticker, covering all of its dates.
    file_ = 'intraday_bars_gapped_new/{}.csv'.format(ticker)
    if not os.path.exists(file_):
        print(f'No such file or directory: {file_}')
        continue

    print(f'the {file_} exists')
    df_minute_bars = pd.read_csv(file_, parse_dates=['date', 'datetime'])

    for row in df.to_dict(orient='records'):
        session_data = SESSION_TEST(
            row['open'],             # daily data and ticker
            row['high'],
            row['low'],
            row['close'],
            row['date'],
            ticker,
            row['previous close'],
            row['volume'],
            df_minute_bars,          # minute data
        )
        df_intraday_analysis.append(session_data)

    # Rebuild and save the combined result after every ticker that had
    # minute data.
    final_df = pd.concat(df_intraday_analysis, ignore_index=True)
    display(final_df)
    final_df.to_csv('mikeys-spreadsheet.csv', index=False)

Python: Calculate VWAP by Date

I'm trying to add to this code a function that would calculate vwap by date, but it isn't working:
def get_ohlc(pair, interval=1, since='last'):
    """Download Kraken OHLC candles for ``pair`` and attach a ``Vwap`` column.

    The JSON payload under ``result[pair]`` is loaded into a DataFrame indexed
    by ``Time``, the exchange-supplied ``vwap`` and ``count`` columns are
    dropped, and the price/volume columns are cast to float.

    The final assignment stores the whole frame returned by the grouped
    ``apply(Vwap)`` into a single column; this is the statement that produces
    the ValueError quoted below the listing.
    """
    endpoint = 'https://api.kraken.com/0/public/OHLC'
    payLoad = {'pair': pair, 'interval': interval, 'since': since}
    response = requests.get(endpoint, payLoad)
    data = response.json()
    OHLC = data['result'][pair]

    raw_columns = ['Time', 'Open', 'High', 'Low', 'Close', 'vwap', 'volume', 'count']
    data = pd.DataFrame.from_records(OHLC, columns=raw_columns)
    data['Time'] = pd.to_datetime(data['Time'], unit='s')
    data['Date'] = data['Time'].dt.date
    data.set_index('Time', inplace=True)
    data = data.drop(['vwap', 'count'], axis=1)

    for column in ('Open', 'High', 'Low', 'Close', 'volume'):
        data[column] = data[column].astype(float)

    data['Vwap'] = data.groupby('Date', group_keys=False).apply(Vwap)
    return data
def Vwap(data):
    """Return a copy of ``data`` with a cumulative ``Vwap`` column added.

    The typical price ``(High + Low + Close) / 3`` is weighted by ``volume``;
    the running sum of that product is divided by the running sum of volume.
    """
    typical_price = (data.High + data.Low + data.Close) / 3
    volume = data.volume
    running_vwap = (volume * typical_price).cumsum() / volume.cumsum()
    return data.assign(Vwap=running_vwap)
I get the following error:
ValueError: Wrong number of items passed 7, placement implies 1
In my view, you have been mixing the "responsibilities" in your code:
- the Vwap function should only take care of the calculation;
- the vwap column can be created in the get_ohlc function (by the way, that function is doing too many things in my view; I would probably split the download from the manipulation of the data).
Anyway, this is how I would write a quick solution to your problem:
import requests
import pandas as pd
def get_ohlc(pair, interval=1, since='last'):
    """Download Kraken OHLC candles for ``pair`` and add a per-day ``vwap``.

    Args:
        pair: Kraken pair name; it is also the key looked up under ``result``
            in the response.
        interval: value sent as the ``interval`` query parameter.
        since: value sent as the ``since`` query parameter.

    Returns:
        DataFrame indexed by ``Time`` with float ``Open``, ``High``, ``Low``,
        ``Close`` and ``volume`` columns, a ``Date`` column, and a ``vwap``
        column computed by ``vwap_func`` separately for each ``Date``.

    Raises:
        requests.HTTPError: if the endpoint answers with an error status.
        KeyError: if the response has no ``result[pair]`` entry.
    """
    endpoint = 'https://api.kraken.com/0/public/OHLC'
    payLoad = {
        'pair': pair,
        'interval': interval,
        'since': since,
    }
    # A timeout stops the call from hanging forever on a stalled connection;
    # raise_for_status surfaces HTTP errors instead of failing later on
    # missing keys.
    response = requests.get(endpoint, payLoad, timeout=30)
    response.raise_for_status()
    payload = response.json()
    OHLC = payload['result'][pair]

    data = pd.DataFrame.from_records(
        OHLC,
        columns=['Time', 'Open', 'High', 'Low', 'Close', 'vwap', 'volume', 'count'],
    )
    data['Time'] = pd.to_datetime(data['Time'], unit='s')
    data['Date'] = data['Time'].dt.date
    data.set_index('Time', inplace=True)
    # The exchange-supplied vwap is discarded; it is recomputed per day below.
    data = data.drop(['vwap', 'count'], axis=1)

    # One cast for all numeric columns instead of five separate assignments.
    numeric_columns = ['Open', 'High', 'Low', 'Close', 'volume']
    data = data.astype({column: float for column in numeric_columns})

    data = data.assign(vwap=data.groupby('Date', group_keys=False).apply(vwap_func))
    return data
def vwap_func(data):
    """Return the cumulative volume-weighted average price as a one-column frame.

    The running sum of ``volume * (High + Low + Close) / 3`` is divided by
    the running sum of ``volume``; the resulting Series is converted with
    ``to_frame()``.
    """
    volume = data["volume"]
    price_sum = data["High"] + data["Low"] + data["Close"]
    running_vwap = (volume * price_sum / 3).cumsum() / volume.cumsum()
    return running_vwap.to_frame()
# Fetch the candles for XXBTZUSD with the default interval and print the
# resulting frame, vwap column included.
data = get_ohlc(pair="XXBTZUSD")
print(data)
As you can see, there is no need to call vwap_func at the end, given that it is applied already in your get_ohlc function

How do I get all the prices history with binance API for a crypto using Python?

I've been using this script to get the prices from some cryptocurrencies using Binance API and this script:
https://steemit.com/python/@marketstack/how-to-download-historical-price-data-from-binance-with-python
The problem is that with this script I cannot control the date range: for example, I want to choose the period range between Dec. 2015 and Dec. 2020, or I want the DAILY PRICES from the first day trading for any crypto ...etc.
So I share with you the code I'm using (copied from the steemit code and modified a little bit)
How can I do it?
# https://steemit.com/python/@marketstack/how-to-download-historical-price-data-from-binance-with-python
import requests
import json
import pandas as pd
import numpy as np
import datetime as dt
frequency = input("Please enter the frequency (1m/5m/30m/.../1h/6h/1d/ : ")


def get_bars(symbol, interval=frequency):
    """Download klines for ``symbol`` from Binance and return them as a DataFrame.

    Args:
        symbol: trading pair, e.g. ``'BTCUSDT'``.
        interval: kline interval string; defaults to the frequency entered at
            the prompt above (the default is bound when the function is
            defined).

    Returns:
        DataFrame with one row per kline, the twelve columns named below, and
        an index built from ``close_time`` (milliseconds) converted with
        ``datetime.fromtimestamp``, i.e. local time.

    Raises:
        requests.HTTPError: if the endpoint answers with an error status.
    """
    root_url = 'https://api.binance.com/api/v1/klines'
    # Let requests build and escape the query string instead of concatenating it.
    response = requests.get(root_url, params={'symbol': symbol, 'interval': interval})
    response.raise_for_status()
    df = pd.DataFrame(response.json())
    df.columns = ['open_time',
                  'o', 'h', 'l', 'c', 'v',
                  'close_time', 'qav', 'num_trades',
                  'taker_base_vol', 'taker_quote_vol', 'ignore']
    df.index = [dt.datetime.fromtimestamp(x / 1000.0) for x in df.close_time]
    return df
# Download the klines for both pairs at the frequency entered at the prompt.
btcusdt = get_bars('BTCUSDT')
ethusdt = get_bars('ETHUSDT')
# Write each result to its own CSV file.
df0=pd.DataFrame(btcusdt)
df0.to_csv('_btcusdt.csv')
df1=pd.DataFrame(ethusdt)
df1.to_csv('_ethusdt.csv')
Can anyone help me to optimize it?
I am using this, taken from the python-binance documentation: https://python-binance.readthedocs.io/en/latest/binance.html?highlight=get_historical_klines#binance.client.Client.get_historical_klines
import os
from binance.client import Client
import pandas as pd
import datetime, time
def GetHistoricalData(self, howLong):
    """Fetch 1-minute BNBBTC klines for the last ``howLong`` days and print them.

    Every intermediate value is stored on ``self`` (``howLong``,
    ``untilThisDate``, ``sinceThisDate``, ``candle``, ``df``). The final
    ``self.df`` is indexed by the formatted ``dateTime`` string and keeps only
    the open/high/low/close/volume columns.
    """
    self.howLong = howLong

    # Window for the query: from `howLong` days ago until now.
    self.untilThisDate = datetime.datetime.now()
    self.sinceThisDate = self.untilThisDate - datetime.timedelta(days = self.howLong)

    # The API call takes the window bounds as strings.
    since_text = str(self.sinceThisDate)
    until_text = str(self.untilThisDate)
    self.candle = self.client.get_historical_klines(
        "BNBBTC", Client.KLINE_INTERVAL_1MINUTE, since_text, until_text)

    # Label the columns returned by Binance so they can be addressed by name.
    kline_columns = ['dateTime', 'open', 'high', 'low', 'close', 'volume',
                     'closeTime', 'quoteAssetVolume', 'numberOfTrades',
                     'takerBuyBaseVol', 'takerBuyQuoteVol', 'ignore']
    self.df = pd.DataFrame(self.candle, columns=kline_columns)

    # Timestamps arrive in milliseconds; turn them into formatted strings.
    parsed = pd.to_datetime(self.df.dateTime, unit='ms')
    self.df.dateTime = parsed.dt.strftime(Constants.DateTimeFormat)
    self.df.set_index('dateTime', inplace=True)

    # Drop the columns that are not needed.
    unused_columns = ['closeTime', 'quoteAssetVolume', 'numberOfTrades',
                      'takerBuyBaseVol', 'takerBuyQuoteVol', 'ignore']
    self.df = self.df.drop(unused_columns, axis=1)
    print(self.df)
I do hope this helps someone.
(Please note that this method is cut out of a class I have, so you may get rid of all the `self.` prefixes.) You also need to have your client set up beforehand with:
# Create the python-binance client from your API key and secret.
client = Client(api_key, api_secret)
Any improvements are of course welcome !
This is a function that I used.
Start and end are dates in Unix timestamp format. Interval is graph interval.
And keep in mind Binance did not exist in Dec 2015 :-)
def get_klines_iter(symbol, interval, start, end, limit=5000):
    """Download Binance spot klines between ``start`` and ``end``, paging backwards.

    Args:
        symbol: trading pair, e.g. ``'BTCUSDT'``.
        interval: Binance interval string, e.g. ``'1m'``.
        start: lower bound as a Unix timestamp (presumably milliseconds, since
            it is compared with the ``Opentime`` values Binance returns).
        end: upper bound, same unit; sent as ``endTime`` of the first request.
        limit: rows requested per call, capped at 1000 per request.

    Returns:
        DataFrame of klines in ascending ``Opentime`` order, restricted to
        ``Opentime >= start``. It is empty when ``end <= start`` or when the
        API returns nothing.
    """
    columns = ['Opentime', 'Open', 'High', 'Low', 'Close', 'Volume', 'Closetime',
               'Quote asset volume', 'Number of trades', 'Taker by base',
               'Taker buy quote', 'Ignore']
    # The original referenced an undefined name here; `limit` is what was meant.
    page_size = max(1, min(limit, 1000))

    pages = []
    cursor = end
    while cursor > start:
        url = ('https://api.binance.com/api/v3/klines?symbol=' + symbol
               + '&interval=' + interval
               + '&limit=' + str(page_size)
               + '&endTime=' + str(cursor))
        page = pd.read_json(url)
        if page.empty:
            # Nothing older is available: stop instead of looping forever.
            break
        page.columns = columns
        pages.append(page)
        # Step just below the oldest candle received so that it is not
        # downloaded again as the last row of the next page.
        cursor = int(page['Opentime'].iloc[0]) - 1

    if not pages:
        return pd.DataFrame(columns=columns)

    # Pages were collected newest-first; restore chronological order.
    df = pd.concat(pages[::-1], axis=0, ignore_index=True)
    # The oldest page may reach back past `start`; trim those rows.
    df = df[df['Opentime'] >= start]
    df.reset_index(drop=True, inplace=True)
    return df
from datetime import datetime
import pandas as pd
import requests
from typing import *
import time
class BinanceClient:
    """Small REST client for Binance public market data (spot or futures).

    Attributes:
        exchange: constant label ``"BINANCE"``.
        futures: whether the futures host and endpoints are used.
        symbols: symbol names fetched from ``exchangeInfo`` on construction;
            an empty list if that request failed.
    """

    def __init__(self, futures=False):
        self.exchange = "BINANCE"
        self.futures = futures
        if self.futures:
            self._base_url = "https://fapi.binance.com"
        else:
            self._base_url = "https://api.binance.com"
        # NOTE: this performs a network request as part of construction.
        self.symbols = self._get_symbols()

    def _make_request(self, endpoint: str, query_parameters: Dict):
        """GET ``endpoint`` and return the decoded JSON, or None on any failure.

        Connection problems and non-200 answers are reported with ``print``
        and turned into a ``None`` return value.
        """
        try:
            response = requests.get(self._base_url + endpoint, params=query_parameters)
        except requests.RequestException as e:
            # print() does not interpolate logging-style %s arguments, so the
            # message is formatted explicitly.
            print(f"Connection error while making request to {endpoint}: {e}")
            return None

        if response.status_code == 200:
            return response.json()

        # Use the raw body: an error response is not guaranteed to be JSON.
        print(f"Error while making request to {endpoint}: {response.text} "
              f"(status code = {response.status_code})")
        return None

    def _get_symbols(self) -> List[str]:
        """Return every symbol name listed by ``exchangeInfo`` ([] on failure)."""
        endpoint = "/fapi/v1/exchangeInfo" if self.futures else "/api/v3/exchangeInfo"
        data = self._make_request(endpoint, dict())
        if data is None:
            # The request failed and was already reported by _make_request.
            return []
        return [x["symbol"] for x in data["symbols"]]

    def get_historical_data(self, symbol: str, interval: Optional[str] = "1m", start_time: Optional[int] = None, end_time: Optional[int] = None, limit: Optional[int] = 1500):
        """Fetch one page of klines.

        Args:
            symbol: trading pair.
            interval: kline interval string.
            start_time: sent as ``startTime`` when not None.
            end_time: sent as ``endTime`` when not None.
            limit: sent as ``limit``.

        Returns:
            A list of 6-tuples of floats built from the first six fields of
            each kline, or None if the request failed.
        """
        params = {"symbol": symbol, "interval": interval, "limit": limit}
        if start_time is not None:
            params["startTime"] = start_time
        if end_time is not None:
            params["endTime"] = end_time

        endpoint = "/fapi/v1/klines" if self.futures else "/api/v3/klines"
        raw_candles = self._make_request(endpoint, params)
        if raw_candles is None:
            return None
        return [tuple(float(value) for value in c[:6]) for c in raw_candles]
def ms_to_dt_utc(ms: int) -> datetime:
    """Convert epoch milliseconds to a naive datetime expressed in UTC."""
    # Local import: only `datetime` is imported at the top of this snippet.
    from datetime import timezone

    # datetime.utcfromtimestamp() is deprecated since Python 3.12. Build an
    # aware UTC value and strip the tzinfo so that callers still receive the
    # naive datetime they got before.
    return datetime.fromtimestamp(ms / 1000, tz=timezone.utc).replace(tzinfo=None)
def ms_to_dt_local(ms: int) -> datetime:
    """Convert epoch milliseconds to a naive datetime in the local time zone."""
    seconds = ms / 1000
    return datetime.fromtimestamp(seconds)
def GetDataFrame(data):
    """Turn a list of candle tuples into a DataFrame indexed by timestamp.

    Args:
        data: iterable of ``(timestamp_ms, open, high, low, close, volume)``
            rows; may be empty.

    Returns:
        DataFrame indexed by ``Timestamp`` (local-time datetimes) with the
        columns ``Date`` (dd/mm/YYYY), ``Time`` (HH:MM:SS), ``Open``, ``High``,
        ``Low``, ``Close`` and ``Volume``.
    """
    df = pd.DataFrame(data, columns=['Timestamp', "Open", "High", "Low", "Close", "Volume"])
    # pd.to_datetime guarantees a datetime dtype even when `data` is empty;
    # without it the `.dt` accessor below raises on an empty column.
    df["Timestamp"] = pd.to_datetime(df["Timestamp"].apply(lambda x: ms_to_dt_local(x)))
    df['Date'] = df["Timestamp"].dt.strftime("%d/%m/%Y")
    df['Time'] = df["Timestamp"].dt.strftime("%H:%M:%S")
    column_names = ["Date", "Time", "Open", "High", "Low", "Close", "Volume"]
    df = df.set_index('Timestamp')
    df = df.reindex(columns=column_names)
    return df
def GetHistoricalData(client, symbol, start_time, end_time, limit=1500, interval="1m"):
    """Collect candles for ``symbol`` between two epoch-millisecond bounds.

    Pages are requested from ``client.get_historical_data`` until the window
    is covered or the client returns nothing.

    Args:
        client: object exposing ``exchange`` and ``get_historical_data``.
        symbol: trading pair.
        start_time: window start; advanced past the last candle of each page.
        end_time: window end.
        limit: rows requested per page.
        interval: kline interval forwarded to the client (previously never
            forwarded, so the client's default was always used).

    Returns:
        List of candle tuples in the order received; empty if nothing was
        returned.
    """
    collection = []
    while start_time < end_time:
        data = client.get_historical_data(symbol, interval=interval, start_time=start_time, end_time=end_time, limit=limit)
        if not data:
            # None means the request failed, [] means there is no more data;
            # either way indexing data[0] would fail and retrying the same
            # window would never terminate.
            break
        print(client.exchange + " " + symbol + " : Collected " + str(len(data)) + " initial data from "+ str(ms_to_dt_local(data[0][0])) +" to " + str(ms_to_dt_local(data[-1][0])))
        collection += data
        # Resume one second after the last candle's open time.
        start_time = int(data[-1][0] + 1000)
        if start_time < end_time:
            # Pause between requests; no need to wait after the final page.
            time.sleep(1.1)
    return collection
# Spot-market client; note that the constructor fetches the symbol list.
client = BinanceClient(futures=False)
symbol = "BTCUSDT"
# NOTE: `interval` is assigned here but is not passed to GetHistoricalData below.
interval = "1m"
# Window bounds in epoch milliseconds. strptime returns naive datetimes, so
# .timestamp() interprets them in the local time zone.
fromDate = int(datetime.strptime('2021-11-15', '%Y-%m-%d').timestamp() * 1000)
toDate = int(datetime.strptime('2021-11-16', '%Y-%m-%d').timestamp() * 1000)
data = GetHistoricalData(client, symbol, fromDate, toDate)
df = GetDataFrame(data)
# Bare expression: displays the frame when run in a notebook or REPL.
df
based on Mike Malyi and isnvi23h4's answer:
Please use Python >= 3.7. Apart from pandas, the code does not need any additional dependencies to be installed.
import pandas as pd
from datetime import datetime, timezone, timedelta
import calendar
def get_klines_iter(symbol, interval, start, end = None, limit=1000,
                    csv_path='0y_eth_only17andnew.csv'):
    """Download Binance spot klines for a date range and save them as TSV.

    Dates are interpreted in UTC. Requests are paged forward from ``start``
    until ``end`` is reached or the API has no more rows.

    Args:
        symbol: trading pair, e.g. ``'ETHBUSD'``.
        interval: Binance interval string, e.g. ``'30m'``.
        start: ISO date string (``YYYY-MM-DD``); required.
        end: ISO date string, or None for "now".
        limit: rows requested per call, capped at 1000 per request.
        csv_path: tab-separated output file; pass None to skip writing.

    Returns:
        DataFrame with the columns ``Date`` (dd/mm/YYYY), ``Time``
        (HH:MM:SS), ``Open``, ``High``, ``Low``, ``Close`` and ``Volume``,
        or None if ``start`` is None.
    """
    if start is None:
        print('start time must not be None')
        return None

    raw_columns = ['Opentime', 'Open', 'High', 'Low', 'Close', 'Volume', 'Closetime',
                   'Quote asset volume', 'Number of trades', 'Taker by base',
                   'Taker buy quote', 'Ignore']
    column_names = ["Date", "Time", "Open", "High", "Low", "Close", "Volume"]
    page_size = max(1, min(limit, 1000))

    start_ms = calendar.timegm(datetime.fromisoformat(start).timetuple()) * 1000
    if end is None:
        # Default to the current UTC time. The original returned at this
        # point, so calling without `end` did nothing at all.
        end_ms = int(datetime.now(timezone.utc).timestamp()) * 1000
    else:
        end_ms = calendar.timegm(datetime.fromisoformat(end).timetuple()) * 1000

    pages = []
    next_start = start_ms
    while next_start <= end_ms:
        url = ('https://api.binance.com/api/v3/klines?symbol=' + symbol
               + '&interval=' + interval
               + '&limit=' + str(page_size)
               + '&startTime=' + str(next_start)
               + '&endTime=' + str(end_ms))
        page = pd.read_json(url)
        if page.empty:
            break
        page.columns = raw_columns
        pages.append(page)
        if len(page) < page_size:
            # A short page means the range is exhausted; asking again would
            # only return the same last candle over and over.
            break
        # Continue just after the last candle so it is not fetched twice.
        next_start = int(page['Opentime'].iloc[-1]) + 1

    if pages:
        raw = pd.concat(pages, axis=0, ignore_index=True)
        opened = pd.to_datetime(raw['Opentime'], unit='ms')
        raw['Date'] = opened.dt.strftime("%d/%m/%Y")
        raw['Time'] = opened.dt.strftime("%H:%M:%S")
        df = raw.reindex(columns=column_names)
    else:
        df = pd.DataFrame(columns=column_names)

    if csv_path is not None:
        df.to_csv(csv_path, sep='\t', index=False)
    return df
# Example run: 30-minute ETHBUSD candles from 2022-01-01 to 2022-02-21.
get_klines_iter('ETHBUSD', '30m', '2022-01-01', '2022-02-21')
I do it like this:
def get_binance_data(api_key, pair, countdown='open', interval='4h', start='1 Jan 2018', end=None):
    """Download historical klines with python-binance and return OHLCV floats.

    Args:
        api_key: Binance API key used to create the client.
        pair: trading pair, e.g. ``'ETHUSDT'``.
        countdown: not used by this function; kept so existing calls keep
            working.
        interval: one of ``'15m'``, ``'1h'``, ``'4h'``, ``'1d'``; any other
            value falls back to the 4-hour interval.
        start: start date string passed through as ``start_str``.
        end: optional end date string passed through as ``end_str``.

    Returns:
        DataFrame of float ``open``, ``high``, ``low``, ``close`` and
        ``volume`` columns, indexed by the candle open time formatted as
        ``YYYY-MM-DD HH:MM:SS`` strings. Empty if no klines were returned.
    """
    client = Client(api_key=api_key)
    intervals = {
        '15m': Client.KLINE_INTERVAL_15MINUTE,
        '1h': Client.KLINE_INTERVAL_1HOUR,
        '4h': Client.KLINE_INTERVAL_4HOUR,
        '1d': Client.KLINE_INTERVAL_1DAY,
    }
    # Fall back to the library constant rather than a bare string literal.
    interval = intervals.get(interval, Client.KLINE_INTERVAL_4HOUR)
    print(f'Historical interval {interval}')

    klines = client.get_historical_klines(symbol=pair, interval=interval, start_str=start, end_str=end)

    # Naming the columns in the constructor also works for an empty result;
    # assigning `.columns` afterwards raises a length mismatch on an empty frame.
    kline_columns = ['open_time', 'open', 'high', 'low', 'close', 'volume',
                     'close_time', 'qav', 'num_trades', 'taker_base_vol',
                     'taker_quote_vol', 'ignore']
    data = pd.DataFrame(klines, columns=kline_columns)

    stamps = pd.to_datetime(data['open_time'], unit='ms').dt.strftime('%Y-%m-%d %H:%M:%S')
    # A plain list keeps the index unnamed, as before.
    data.index = stamps.tolist()

    usecols = ['open', 'high', 'low', 'close', 'volume']
    return data[usecols].astype('float')
api_key = 'хххх...xxx' # use your api-key
symbol = 'ETHUSDT'
# countdown, interval, start and end all take their default values here.
eth = get_binance_data(api_key, symbol)
# Bare expression: shows the first rows when run in a notebook or REPL.
eth.head()
Output:
Historical interval 4h
open high low close volume
2018-01-01 00:00:00 733.01 737.99 716.80 734.50 8739.23361
2018-01-01 04:00:00 734.99 763.55 730.01 751.99 9492.34734
2018-01-01 08:00:00 751.77 759.00 730.58 741.01 8939.36851
2018-01-01 12:00:00 741.01 752.27 724.15 748.80 11284.08664
2018-01-01 16:00:00 748.27 749.98 733.00 746.23 7757.00362
import requests
# Request ETHEUR klines at the '1d' interval from the public REST endpoint
# and print the decoded JSON as-is.
market = 'ETHEUR'
tick_interval = '1d'
url = 'https://api.binance.com/api/v3/klines?symbol='+market+'&interval='+tick_interval
data = requests.get(url).json()
print(data)

Overwriting one data with another data in pandas(dataframe)

Every 120 seconds I fetch data, but the most recent data overwrites the previous data in the SQL database. I want all of the data to be saved. In addition, is the timer correct?
import sqlalchemy as sa
import psycopg2
import requests as rq
import pandas as pd
import json
import time
start_time = time.time()
while True:
    temp = pd.DataFrame()
    df = pd.DataFrame()
    vehicleList = {"SN63NBK", "YY67UTP"}
    # Collect the arrivals of every vehicle into one frame.
    for ids in vehicleList:
        r = rq.get('https://api.tfl.gov.uk/Vehicle/' + ids + '/Arrivals')
        r = r.text
        temp = pd.read_json(r)
        # NOTE: this stores the literal string 'ids', not the vehicle id.
        temp['Type'] = 'ids'
        df = pd.concat([df, temp], sort=False).reset_index(drop=True)
    # Credentials and host are separated by '@'; the listing had a garbled
    # '#' here, which is not a valid connection URL.
    engine = sa.create_engine('postgresql+psycopg2://postgres:3434@127.0.0.1/postgres')
    # Serialise the nested `timing` values to JSON text before storing them.
    df['timing'] = list(map(lambda x: json.dumps(x), df['timing']))
    # NOTE: if_exists='replace' recreates the table on every pass, so only the
    # latest batch survives. This is the behaviour the question asks about.
    df.to_sql('tfl_bus_pg6', engine, if_exists='replace', index=False)
    # Sleep until the next multiple of 120 s counted from start_time, so the
    # period does not drift with the time the loop body itself takes.
    time.sleep(120.0 - ((time.time() - start_time) % 120.0))
I changed your code slightly, but I think the main problem is the if_exists parameter, which you should set to 'append', as @K753 mentioned in the comments.
Also, YY67UTP id returns nothing, so I replaced it with another random id from the site to illustrate how code works.
def _data_gen(vehicles):
    """Yield one arrivals DataFrame per vehicle id.

    Each frame comes from the TfL ``/Vehicle/<id>/Arrivals`` endpoint and
    gets a ``Type`` column holding the vehicle id. The generator waits one
    second before every request.

    Raises:
        requests.HTTPError: if an endpoint answers with an error status.
    """
    # Local import so the snippet does not rely on a file-level import.
    from io import StringIO

    for ids in vehicles:
        time.sleep(1)
        r = rq.get('https://api.tfl.gov.uk/Vehicle/' + ids + '/Arrivals')
        r.raise_for_status()
        # Passing a literal JSON string to read_json is deprecated; wrap the
        # text in a file-like object instead.
        temp = pd.read_json(StringIO(r.text))
        temp['Type'] = ids
        yield temp
vehicleList = {"SN63NBK", "YY67UTP"}
# Create the engine once instead of on every pass. Credentials and host are
# separated by '@'; the listing had a garbled '#' here.
engine = sa.create_engine('postgresql+psycopg2://postgres:3434@127.0.0.1/postgres')

while True:
    # how do you break from while loop if you need to?
    df = pd.concat(_data_gen(vehicleList), sort=False, ignore_index=True)
    # Serialise the nested `timing` values to JSON text before storing them.
    df['timing'] = [json.dumps(x) for x in df['timing']]
    # 'append' keeps the rows written by earlier passes.
    df.to_sql('tfl_bus_pg6', engine, if_exists='append', index=False)
    time.sleep(120)

dataframe append new series column with data

I have a Panda DataFrame structure and I want to add another column to it, but I can't do it with append, add or insert.
I'm trying to replicate the portfolio data with pandas' built-in functions, because this script doesn't give me correct data if the period I request is shorter than about 1.5 years, whereas I need to be able to obtain data even for just two days. So here's the script that I want to rewrite:
import QSTK.qstkutil.qsdateutil as du
import QSTK.qstkutil.tsutil as tsu
import QSTK.qstkutil.DataAccess as da
import datetime as dt
import matplotlib.pyplot as plt
import pandas as pd
ls_symbols = ["AAPL", "GLD", "GOOG", "$SPX", "XOM"]
dt_start = dt.datetime(2006, 1, 1)
dt_end = dt.datetime(2010, 12, 31)
dt_timeofday = dt.timedelta(hours=16)
ldt_timestamps = du.getNYSEdays(dt_start, dt_end, dt_timeofday)
c_dataobj = da.DataAccess('Yahoo')
ls_keys = ['open', 'high', 'low', 'close', 'volume', 'actual_close']
ldf_data = c_dataobj.get_data(ldt_timestamps, ls_symbols, ls_keys)
# Pair each requested field name with the matching item of the returned list.
# (The '**' markers around this line in the listing were emphasis markup,
# not code.)
d_data = dict(zip(ls_keys, ldf_data))
d_data = dict(zip(ls_keys, ldf_data)) is what I want to replicate because it doesn't fetch the data that I want, but I need to figure out a way to append a new column to my dict. Here is my script:
from pandas.io.data import DataReader, DataFrame
import QSTK.qstkutil.qsdateutil as du
import QSTK.qstkutil.DataAccess as da
import datetime as dt
def get_historical_data(symbol, source, date_from, date_to):
global data_validator
symbol_data = {}
ls_keys = ['Open', 'High', 'Low', 'Close', 'Volume', 'Adj Close']
for key in ls_keys:
symbol_data[key] = DataFrame({})
dataframe_open = DataFrame({})
for item in symbol:
print 'Fetching data for:', item
current_data = DataReader(str(item), source, date_from, date_to)
dataframe_open = {item : current_data['Open']}
if len(symbol_data['Open'].columns) == 0:
symbol_data['Open'] = DataFrame(dataframe_open)
else:
**#i want to add the new column here but can't seem to do this.**
#symbol_data['Open'].loc[:item] = DataFrame(dataframe_open)
pass
return symbol_data
P.S. I call the func with these parameters for testing purposes:
# The module is imported as `dt` above, so the class must be reached as
# dt.datetime; a bare `datetime(...)` is not in scope.
test = get_historical_data(['SPY', 'DIA'], 'yahoo', dt.datetime(2015, 1, 1), dt.datetime(2015, 1, 31))
Does the following help? Have not tested yet, but should work in principle... Just put the data in arrays of equal length and construct the data frame from that.
def get_historical_data(symbols=None, source=None, date_from=None, date_to=None):
    """Fetch price history for several tickers into a single DataFrame.

    Args:
        symbols: iterable of ticker symbols; None or empty gives an empty frame.
        source: data source name passed to ``DataReader``.
        date_from: start of the requested period.
        date_to: end of the requested period.

    Returns:
        DataFrame whose columns form a two-level index ``(field, symbol)``,
        with field one of Open/High/Low/Close/Volume/Adj Close. Selecting
        ``result['Open']`` therefore gives one column per ticker. Rows are
        the union of the dates returned for the individual tickers.
    """
    # Local import so the snippet does not depend on file-level pandas names.
    from pandas import DataFrame, concat

    ls_keys = ['Open', 'High', 'Low', 'Close', 'Volume', 'Adj Close']
    if not symbols:
        return DataFrame()

    # Download once per ticker (the earlier draft looped over the field names
    # and passed those to DataReader as if they were tickers).
    per_symbol = {}
    for item in symbols:
        per_symbol[str(item)] = DataReader(str(item), source, date_from, date_to)[ls_keys]

    # concat on a dict yields (symbol, field) columns and aligns the frames on
    # their date index; swap the levels so the field comes first.
    combined = concat(per_symbol, axis=1)
    return combined.swaplevel(0, 1, axis=1).sort_index(axis=1)

Categories