Overwriting one data with another data in pandas(dataframe) - python

Periodically (every 120 seconds) get data but recent data overwrites previous data in SQL DB. I want all data to be saved.In addition, is the timer correct?
import sqlalchemy as sa
import psycopg2
import requests as rq
import pandas as pd
import json
import time
start_time = time.time()
while True:
temp = pd.DataFrame()
df = pd.DataFrame()
vehicleList = {"SN63NBK", "YY67UTP"}
for ids in vehicleList:
r = rq.get('https://api.tfl.gov.uk/Vehicle/' + ids + '/Arrivals')
r = r.text
temp = pd.read_json(r)
temp['Type'] = 'ids'
df = pd.concat([df, temp], sort=False).reset_index(drop=True)
engine = sa.create_engine('postgresql+psycopg2://postgres:3434#127.0.0.1/postgres')
df['timing'] = list(map(lambda x: json.dumps(x), df['timing']))
df.to_sql('tfl_bus_pg6', engine, if_exists='replace', index=False)
time.sleep(120.0 - ((time.time() - start_time) % 120.0))

I changed your code slightly, but I think the main problem is in if_exists parameter which you should set to append, as #K753 have mentioned in the comments.
Also, YY67UTP id returns nothing, so I replaced it with another random id from the site to illustrate how code works.
def _data_gen(vehicles):
""" Yields a dataframe for each request """
for ids in vehicles:
time.sleep(1)
r = rq.get('https://api.tfl.gov.uk/Vehicle/' + ids + '/Arrivals')
temp = pd.read_json(r.text)
temp['Type'] = ids
yield temp
while True:
# how do you break from while loop if you need to?
vehicleList = {"SN63NBK", "YY67UTP"}
df = pd.concat(_data_gen(vehicleList), sort=False, ignore_index=True)
engine = sa.create_engine('postgresql+psycopg2://postgres:3434#127.0.0.1/postgres')
df['timing'] = list(map(lambda x: json.dumps(x), df['timing']))
df.to_sql('tfl_bus_pg6', engine, if_exists='append', index=False)
time.sleep(120)

Related

Unable to process large amount of data using for loop

I am downloading 2 years worth of OHLC for 10k symbols and writing it to database. When I try to pull the entire list it crashes (but doesn't if I download 20%):
import config
from alpaca_trade_api.rest import REST, TimeFrame
import sqlite3
import pandas as pd
import datetime
from dateutil.relativedelta import relativedelta
start_date = (datetime.datetime.now() - relativedelta(years=2)).date()
start_date = pd.Timestamp(start_date, tz='America/New_York').isoformat()
end_date = pd.Timestamp(datetime.datetime.now(), tz='America/New_York').isoformat()
conn = sqlite3.connect('allStockData.db')
api = REST(config.api_key_id, config.api_secret, base_url=config.base_url)
origin_symbols = pd.read_sql_query("SELECT symbol, name from stock", conn)
df = origin_symbols
df_dict = df.to_dict('records')
startTime = datetime.datetime.now()
api = REST(config.api_key_id, config.api_secret, base_url=config.base_url)
temp_data = []
for key in df_dict:
symbol = key['symbol']
print(f"downloading ${symbol}")
# stock_id = key['id']
barsets = api.get_bars_iter(symbol, TimeFrame.Day, start_date, end_date)
barsets = list(barsets)
for index, bar in enumerate(barsets):
bars = pd.DataFrame({'date': bar.t.date(), 'symbol': symbol, 'open': bar.o, 'high': bar.h, 'low': bar.l, 'close': bar.c, 'volume': bar.v, 'vwap': bar.vw}, index=[0])
temp_data.append(bars)
print("loop complete")
data = pd.concat(temp_data)
# write df back to sql, replacing the previous table
data.to_sql('daily_ohlc_init', if_exists='replace', con=conn, index=True)
endTime = datetime.datetime.now()
print(f'time elapsed to pull data was {endTime - startTime}')
To make it work I add this line after df_dict to limit symbols downloaded:
df_dict = df_dict[0:2000]
This will allow me to write to database but I need the entire dictionary (about 10k symbols). How do I write to the database without it crashing?
Since you mentioned that you are able to make it work for 2000 records of df_dict at a time, a possible simple approach could be:
api = REST(config.api_key_id, config.api_secret, base_url=config.base_url)
num_records = len(df_dict)
chunk_size = 2000
num_passes = num_records // chunk_size + int(num_records % chunk_size != 0)
for i in range(num_passes):
start = i * chunk_size
end = min((i + 1) * chunk_size, num_records)
df_chunk = df_dict[start: end]
temp_data = []
for key in df_chunk:
symbol = key['symbol']
print(f"downloading ${symbol}")
barsets = api.get_bars_iter(symbol, TimeFrame.Day, start_date, end_date)
barsets = list(barsets)
for index, bar in enumerate(barsets):
bars = [bar.t.date(), symbol, bar.o, bar.h, bar.l, bar.c, bar.v, bar.vw]
temp_data.append(bars)
# should be a bit more efficient to create a dataframe just once
columns = ['date', 'symbol', 'open', 'high', 'low', 'close', 'volume', 'vwap']
data = pd.DataFrame(temp_data, columns=columns)
# should delete previous table when writing first chunk, then start appending from next passes through df_dict
data.to_sql('daily_ohlc_init', if_exists='replace' if i == 0 else 'append', con=conn, index=True)
print(f"Internal loop finished processing records {start} to {end} out of {num_records}.")
endTime = datetime.datetime.now()
print(f'time elapsed to pull data was {endTime - startTime}')

Create merged df based on the url list [pandas]

I was able to extract the data from url_query url, but additionally, I would like to get the data from the urls_list created based on the query['ids'] column from dataframe. Please see below the current logic:
url = 'https://instancename.some-platform.com/api/now/table/data?display_value=true&'
team = 'query=group_name=123456789'
url_query = url+team
dataframe: query
[ids]
0 aaabbb1cccdddeee4ffggghhhhh5iijj
1 aa1bbb2cccdddeee5ffggghhhhh6iijj
issue_list = []
for issue in query['ids']:
issue_list.append(f'https://instancename.some-platform.com/api/now/table/data?display_value=true&?display_value=true&query=group_name&sys_id={issue}')
response = requests.get(url_query, headers=headers,auth=auth, proxies=proxies)
data = response.json()
def api_response(k):
dct = dict(
event_id= k['number'],
created_time = k[‘created’],
status = k[‘status’],
created_by = k[‘raised_by’],
short_desc = k[‘short_description’],
group = k[‘team’]
)
return dct
raw_data = []
for p in data['result']:
rec = api_response(k)
raw_data.append(rec)
df = pd.DataFrame.from_records(raw_data)
df:
The url_query response extracts what I need, but the key is that I would like to add to the existing one 'df' add the data from the issue_list = []. I don't know how to put the issue_list = [] to the response. I've tried to add issue_list to the response = requests.get(issue_list, headers=headers,auth=auth, proxies=proxies) statement, but I've got invalid schema error.
You can create list of DataFrames with query q instead url_query and last join together by concat:
dfs = []
for issue in query['ids']:
q = f'https://instancename.some-platform.com/api/now/table/data?display_value=true&?display_value=true&query=group_name&sys_id={issue}'
response = requests.get(q, headers=headers,auth=auth, proxies=proxies)
data = response.json()
raw_data = [api_response(k) for p in data['result']]
df = pd.DataFrame.from_records(raw_data)
dfs.append(df)
df = pd.concat(dfs, ignore_index=True)

Webscraping data from a json source, why i get only 1 row?

I'am trying to get some information from a website with python, from a webshop.
I tried this one:
def proba():
my_url = requests.get('https://www.telekom.hu/shop/categoryresults/?N=10994&contractType=list_price&instock_products=1&Ns=sku.sortingPrice%7C0%7C%7Cproduct.displayName%7C0&No=0&Nrpp=9&paymentType=FULL')
data = my_url.json()
results = []
products = data['MainContent'][0]['contents'][0]['productList']['products']
for product in products:
name = product['productModel']['displayName']
try:
priceGross = product['priceInfo']['priceItemSale']['gross']
except:
priceGross = product['priceInfo']['priceItemToBase']['gross']
url = product['productModel']['url']
results.append([name, priceGross, url])
df = pd.DataFrame(results, columns = ['Name', 'Price', 'Url'])
# print(df) ## print df
df.to_csv(r'/usr/src/Python-2.7.13/test.csv', sep=',', encoding='utf-8-sig',index = False )
while True:
mytime=datetime.now().strftime("%H:%M:%S")
while mytime < "23:59:59":
print mytime
proba()
mytime=datetime.now().strftime("%H:%M:%S")
In this webshop there are 9 items, but i see only 1 row in the csv file.
Not entirely sure what you intend as end result. Are you wanting to update an existing file? Get data and write out all in one go? Example of latter shown below where I add each new dataframe to an overall dataframe and use a Return statement for the function call to provide each new dataframe.
import requests
from datetime import datetime
import pandas as pd
def proba():
my_url = requests.get('https://www.telekom.hu/shop/categoryresults/?N=10994&contractType=list_price&instock_products=1&Ns=sku.sortingPrice%7C0%7C%7Cproduct.displayName%7C0&No=0&Nrpp=9&paymentType=FULL')
data = my_url.json()
results = []
products = data['MainContent'][0]['contents'][0]['productList']['products']
for product in products:
name = product['productModel']['displayName']
try:
priceGross = product['priceInfo']['priceItemSale']['gross']
except:
priceGross = product['priceInfo']['priceItemToBase']['gross']
url = product['productModel']['url']
results.append([name, priceGross, url])
df = pd.DataFrame(results, columns = ['Name', 'Price', 'Url'])
return df
headers = ['Name', 'Price', 'Url']
df = pd.DataFrame(columns = headers)
while True:
mytime = datetime.now().strftime("%H:%M:%S")
while mytime < "23:59:59":
print(mytime)
dfCurrent = proba()
mytime=datetime.now().strftime("%H:%M:%S")
df = pd.concat([df, dfCurrent])
df.to_csv(r"C:\Users\User\Desktop\test.csv", encoding='utf-8')

How to run a function on each row in DataFrame and append the result to a new DataFrame

NB My code runs if copied
I wrote a simple script to backtest cryptocurrencies using the poloniex API.
First I request the data from the API and turn it into a dataframe data.
Then I take the data I want and make new df called df
A function trade must then be run on each line in df, simple put if the price is above the rolling mean it buys and sells if below, this data is then saved in log.
I am having trouble applying this function on each row in df.
I had great success using the line log = df.apply(lambda x: trade(x['date'], x['close'], x['MA']), axis=1) BUT surprising it works when BTC_ETH is used in the API call and not for others ie BTC_FCT or BTC_DOGE despite the data being identical in form. Using ETH results in the creation of DataFrame (which is what i want) DOGE and FCT creates a Series
First question, how can I run my trade function on each row and create a new df log with the results
Bonus question, even though the data types are the same why does it work for ETH but not for DOGE/FCT ?
import numpy as np
from pandas import Series, DataFrame
import pandas as pd
API = 'https://poloniex.com/public?command=returnChartData&currencyPair=BTC_FCT&start=1435699200&end=9999999999&period=86400'
data = pd.read_json(API)
df = pd.DataFrame(columns = {'date','close','MA'})
df.MA = pd.rolling_mean(data.close, 30)
df.close = data.close
df.date = data.date
df = df.truncate(before=29)
def print_full(x):
pd.set_option('display.max_rows', len(x))
print(x)
pd.reset_option('display.max_rows')
log = pd.DataFrame(columns = ['Date', 'type', 'profit', 'port_value'])
port = {'coin': 0, 'BTC':1}
def trade(date, close, MA):
if MA < close and port['coin'] == 0 :
coins_bought = port['BTC']/MA
port['BTC'] = 0
port['coin'] = coins_bought
d = {'Date':date, 'type':'buy', 'coin_value': port['coin'], 'btc_value':port['BTC']}
return pd.Series(d)
elif MA > close and port['BTC'] == 0 :
coins_sold = port['coin']*MA
port['coin'] = 0
port['BTC'] = coins_sold
d = {'Date':date, 'type':'sell', 'coin_value': port['coin'], 'btc_value':port['BTC']}
print()
return pd.Series(d)
log = df.apply(lambda x: trade(x['date'], x['close'], x['MA']), axis=1)
log = log.dropna()
print_full(log)
EDIT:
I solved the problem, I fixed it by appending the dicts to list and then using the df.from_dict() method to create the log dataframe, my code just to clarify.
def trade(date, close, MA):#, port):
#d = {'Data': close}
#test_log = test_log.append(d, ignore_index=True)
if MA < close and port['coin'] == 0 :
coins_bought = port['BTC']/MA
port['BTC'] = 0
port['coin'] = coins_bought
d = {'Date':date, 'type':'buy', 'coin_value': port['coin'], 'btc_value':port['BTC']}
data_list.append(d)
#return pd.Series(d)
elif MA > close and port['BTC'] == 0 :
coins_sold = port['coin']*MA
port['coin'] = 0
port['BTC'] = coins_sold
d = {'Date':date, 'type':'sell', 'coin_value': port['coin'], 'btc_value':port['BTC']}
data_list.append(d)
#return pd.Series(d)
df.apply(lambda x: trade(x['date'], x['close'], x['MA']), axis=1)
log = log.dropna()
for key,value in port.items():
print(key, value )
log.from_dict(data_list)
The problem is that you are not always returning a value in trade, which is confusing Pandas. Try this:
import numpy as np
from pandas import Series, DataFrame
import pandas as pd
API = 'https://poloniex.com/public?command=returnChartData&currencyPair=BTC_FCT&start=1435699200&end=9999999999&period=86400'
data = pd.read_json(API)
df = pd.DataFrame(columns = {'date','close','MA'})
df.MA = pd.rolling_mean(data.close, 30)
df.close = data.close
df.date = data.date
df = df.truncate(before=29)
def print_full(x):
pd.set_option('display.max_rows', len(x))
print(x)
pd.reset_option('display.max_rows')
log = pd.DataFrame(columns = ['Date', 'type', 'profit', 'port_value'])
port = {'coin': 0, 'BTC':1}
port = {'coin': 0, 'BTC':1}
def trade(date, close, MA):
d = {'Date': date, 'type':'', 'coin_value': np.nan, 'btc_value': np.nan}
if MA < close and port['coin'] == 0 :
coins_bought = port['BTC']/MA
port['BTC'] = 0
port['coin'] = coins_bought
d['type'] = 'buy'
d['coin_value'] = port['coin']
d['btc_value'] = port['BTC']
elif MA > close and port['BTC'] == 0 :
coins_sold = port['coin']*MA
port['coin'] = 0
port['BTC'] = coins_sold
d['type'] = 'sell'
d['coin_value'] = port['coin']
d['btc_value'] = port['BTC']
return pd.Series(d)
log = df.apply(lambda x: trade(x['date'], x['close'], x['MA']), axis=1)
log = log.dropna()
print_full(log)
However, as I mentioned in the comment, passing a function with side-effects to apply is not a good idea according to the documentation, and in fact I think it may not produce the correct result in your case.

Use of function in pandas dataframe with multiprocessing

I am attempting to speed up calculations on a pandas DataFrame using multiprocessing which goes really well minus the fact that assigning the result of the calculation to the df.ix does not work here like it does in my code without trying multiprocessing here
I've added a #sanity check to the code which outputs valid values and would make me think this would work just fine, but the DataFrame doesn't get populated (stays as NaN). Does anyone know why that may be, and more importantly, what changes may be needed to plug the values into the DataFrame in the context of multiprocessing?
Output of sanity check:
should be setting df.ix[4][1] to: 23.2506112824
should be setting df.ix[0][0] to: 0.0
should be setting df.ix[7][0] to: 15.9574526264
code:
import mysql.connector
import numpy as np
from colormath.color_objects import LabColor
from colormath.color_diff import delta_e_cie2000
import pandas as pd
from mysql.connector.pooling import MySQLConnectionPool
from multiprocessing import Pool
pool = Pool()
cnx = mysql.connector.connect(user='user', password='pass',host='localhost', database='database')
cursor = cnx.cursor()
selectstmt = 'SELECT CIE_Lab, ID FROM `database`.`table`'
cursor.execute(selectstmt)
color = cursor.fetchall()
df = pd.DataFrame(columns = color, index = color)
sides = df.index
headers = df.dtypes.index
shape = df.shape[0]
def delta(cie_Lab1, cie_Lab2):
cie_Lab1 = cie_Lab1[1:]
cie_Lab1 = cie_Lab1[:-1]
cie_Lab2 = cie_Lab2[1:]
cie_Lab2 = cie_Lab2[:-1]
CIE_list1 = cie_Lab1.split(",")
CIE_list2 = cie_Lab2.split(",")
#print CIE_list1
CIE_L1 = CIE_list1[0]
CIE_a1 = CIE_list1[1]
CIE_b1 = CIE_list1[2]
CIE_L2 = CIE_list2[0]
CIE_a2 = CIE_list2[1]
CIE_b2 = CIE_list2[2]
color1 = LabColor(lab_l=CIE_L1, lab_a=CIE_a1, lab_b=CIE_b1)
color2 = LabColor(lab_l=CIE_L2, lab_a=CIE_a2, lab_b=CIE_b2)
deltae = delta_e_cie2000(color1, color2, Kl=1, Kc=1, Kh=1)
return deltae
def deltas(nums):
listoflists = []
for num in range(nums):
for mun in range(nums):
listoflists.append([num,mun])
return listoflists
def update(inp):
sides = df.index
headers = df.dtypes.index
num = inp[0]
mun = inp[1]
res = delta(headers[num][0], sides[mun][0])
#sanity check
print "should be setting df.ix["+str(mun)+"]["+str(num)+"] to: "+str(res)
df.ix[mun][num] = res
if __name__ == '__main__':
pool = Pool(4)
pool.map(update, deltas(shape))
pool.close()
pool.join()
print df
Dataframe example:
([69.62248143012944, -54.15108764844451, 67.92070706614288], 1) \
([69.62248143012944, -54.15108764844451, 67.920... NaN
([58.17848217611454, -52.251714243997995, 56.77... NaN
([87.02539335188214, -32.15758725885986, 66.450... NaN
([86.86259502866965, -31.483524711078015, 75.14... NaN
([85.39154525710671, -31.683349117376856, 71.35... NaN

Categories