How to appy threading to make it run faster?

How to appy threading to make it run faster? - python

I have this code to load data to file. I want to make it run concurrently using threads to make it faster. Some people recommended to use asyncio but I could'nt really understand it. This code is for cleaning a csv file. For eg it cleans reads date in arabic format and changes it to the english calender. Can anyone provide a brief overview of how this can be done.
# -*- coding: utf-8 -*-
"""
Created on Sun Apr 12 00:03:38 2020
#author: siradmin
****** DATE ISSUES CODE ******
The purpose of this code is to correct date in different date columns
"""
import os
os.chdir("D://Medgulf Motor/2022/Code for date cleaning")
os.getcwd()
import pandas as pd
import datetime as dt
#df = pd.read_csv("D://Medgulf Motor/2022/Data/Pricing Data 11.05.2021/Tricast/TricastPolicyData.txt",
# engine='python', sep=';', chunksize=100000)
df = pd.read_csv("D://Medgulf Motor/2022/Data/Pricing Data 11.05.2021/Tricast/TricastPolicyData.csv",
engine='python', chunksize=100000 )
columns = ['Issue Date','Inception Date','Expiry Date', 'Policy Status Date',
'Vehicle Issue Date', 'Vehicle Inception Date','Vehicle Expiry Date',
'Status Date', 'Insured Date of Birth','Main Driver DOB']
# 'Istemarah Exp.', 'Additional Driver DOB']
fmts2 = ['%d/%m/%Y', '%d/%m/%y', '%d-%m-%Y', '%d-%m-%y', '%m/%d/%Y', '%Y/%m/%d',
'%Y-%m-%d', '%d|%m|%Y']
new_date = []
j = []
isd,ind,exd,psd,visd,vind,vexd,sd,ise,idb,mdd,add=(pd.DataFrame,)*12
header_flag = True
## Actual Code ##
print(dt.datetime.now())
for cx, chunk in enumerate(df):
for col in columns:
new_date = []
for idx, x in enumerate(chunk[col]):
try:
x = int(x)
dd = dt.datetime(1900,1,1)
da = dt.timedelta(days=int(x)-2)
nd = dd + da
x = nd.date()
except:
pass
for fmt in fmts2:
try:
x = str(x)
# x = str(x).replace("//0/", "/0")
# x = str(x).replace("//1/", "/1")
# x = str(x).replace("//2/", "/2")
x = str(x).replace(" 00:00:00", "")
x = str(x).replace("0/0/", "1/1/")
x = str(x).replace("/0/", "/01/")
x = str(x).replace("/2/", "/02/")
date_object = dt.datetime.strptime(x.strip(), fmt).date()
new_date.append((date_object))
break
except:
pass
if len(new_date) != idx:
pass
elif "29/02" in x or "29-02" in x:
new_date.append((x))
else:
# x = "None"
new_date.append(("")) #new_date.append((x))
match col:
case "Issue Date":
isd = isd.append(chunk.iloc[[idx]])
case "Inception Date":
ind = ind.append(chunk.iloc[[idx]])
case "Expiry Date":
exd = exd.append(chunk.iloc[[idx]])
case "Policy Status Date":
psd = psd.append(chunk.iloc[[idx]])
case "Vehicle Issue Date":
visd = visd.append(chunk.iloc[[idx]])
case "Vehicle Inception Date":
vind = vind.append(chunk.iloc[[idx]])
case "Vehicle Expiry Date":
vexd = vexd.append(chunk.iloc[[idx]])
case "Istemarah Exp.":
ise = ise.append(chunk.iloc[[idx]])
case "Main Driver DOB":
mdd = mdd.append(chunk.iloc[[idx]])
case "Additional Driver DOB":
add = add.append(chunk.iloc[[idx]])
# if col == "Issue Date":
# isd = isd.append(chunk.iloc[[idx]])
# if col == "Inception Date":
# ind = ind.append(chunk.iloc[[idx]])
# if col == "Expiry Date":
# exd = exd.append(chunk.iloc[[idx]])
# if col == "Policy Status Date":
# psd = psd.append(chunk.iloc[[idx]])
# if col == "Vehicle Issue Date":
# visd = visd.append(chunk.iloc[[idx]])
# if col == "Vehicle Inception Date":
# vind = vind.append(chunk.iloc[[idx]])
# if col == "Vehicle Expiry Date":
# vexd = vexd.append(chunk.iloc[[idx]])
# if col == "Istemarah Exp.":
# ise = ise.append(chunk.iloc[[idx]])
# # if col == "Insured Date of Birth":
# # idb = idb.append(chunk.iloc[[idx]])
# if col == "Main Driver DOB":
# mdd = mdd.append(chunk.iloc[[idx]])
# if col == "Additional Driver DOB":
# add = add.append(chunk.iloc[[idx]])
chunk[col] = j = ['{}'.format(t) for idx, t in enumerate(new_date)]
# chunk[col] = pd.to_datetime(chunk[col])
print ("Completed", col)
print ('we have completed ', cx, 'chunk\n')
chunk.to_csv('Tricast Policy Data.csv', mode='a', index =False, header = header_flag)
header_flag = False
print(dt.datetime.now())
if len(isd) != 0:
isd.to_csv("Issuedate.csv")
if len(ind) != 0:
ind.to_csv("Inceptiondatecsv")
if len(exd) != 0:
exd.to_csv("Expirydate.csv")
if len(psd) != 0:
psd.to_csv("policystatedate.csv")
if len(visd) != 0:
visd.to_csv("vehicleissuedate.csv")
if len(vind) != 0:
vind.to_csv("vehicleinceptiondate.csv")
if len(vexd) != 0:
vexd.to_csv("vehicleexpirydate.csv")
if len(sd) != 0:
sd.to_csv("statusdate.csv")
if len(ise) != 0:
ise.to_csv("istemarhexpiry.csv")
if len(idb) != 0:
idb.to_csv("insureddateofbirth.csv")
if len(mdd) != 0:
mdd.to_csv("maindriverdob.csv")
if len(add) != 0:
add.to_csv("adddriverdob.csv")
###############################################################################
Edit: this is the whole code.
My supervisor told me concurrency can be applied to the last part where the data is being loaded to the csv files.

Related

How can I take the last value -1 pandas

I am trying to do a function where I check if a date is in my excel file, and if unfortunately it is not. I retrieve the date before.
I succeeded with the after date and here is my code.
Only with the date before, I really can't do it.
i tried this for the day before:
def get_all_dates_between_2_dates_with_special_begin_substraction(Class, date_départ, date_de_fin, date_debut_analyse, exclus=False):
date_depart = date_départ
date_fin = date_de_fin
result_dates = []
inFile = "database/Calendar_US_Target.xlsx"
inSheetName = "Sheet1"
df =(pd.read_excel(inFile, sheet_name = inSheetName))
date_depart = datetime.datetime.strptime(date_depart, '%Y-%m-%d')
date_fin = datetime.datetime.strptime(date_fin, '%Y-%m-%d')
date_calcul_depart = datetime.datetime.strptime(date_debut_analyse, '%Y-%m-%d')
var_date_depart = date_depart
time_to_add = ""
if (Class.F0 == "mois"):
time_to_add = relativedelta(months=1)
if (Class.F0 == "trimestre"):
time_to_add = relativedelta(months=3)
if (Class.F0 == "semestre"):
time_to_add = relativedelta(months=6)
if (Class.F0 == "année"):
time_to_add = relativedelta(years=1)
while var_date_depart <= date_fin:
-------------------------------------------------------------
df['mask'] = (var_date_depart <= df['TARGETirs_holi']) # daybefore
print(df.head())
print(df[df.mask =="True"].head(1)) #want to check the last true value
------------------------------------------------------------------------------
if (result >= date_calcul_depart):
result = (str(result)[0:10])
result = result[8:10] + "/" + result[5:7] + "/" + result[0:4]
result_dates.append(str(result))
var_date_depart = var_date_depart + time_to_add
if (exclus == True):
result_dates = result_dates[1:-1]
return(result_dates)
I want to say, do a column (or a dataframe) where the first date is true where the first date smaller than the second then i take the last value who is true.
for example:
I have this array [12-05-2022,15-05-2022,16-05-2022 and 19-05-2022]
if i put 15-05-2022, it gives me 15-05-2022, but if i put 18-05-2022, its gives me 16-05-2022
Thanks!

How to create library of a self-written static function in python

I have a python script which has multiple static functions. I want to convert that complete python script into a python library
import pandas as pd
import numpy as np
import EA_Upload_config as cfg
import datetime
#%%
def clockPrint(sentence):
now = datetime.datetime.now()
date_time = now.strftime("%H:%M:%S")
print(date_time + " : " + sentence)
def uploadToEA(df_,ds_api_name,operation_,instance,xmd_=None): #Upsert #Overwrite
import SalesforceEinsteinAnalytics as EA
clockPrint("Upload Process Initiated for "+instance+" instance...")
if instance.lower() == 'commercial':
EAS = EA.salesforceEinsteinAnalytics(env_url='https://spglobalratings.my.salesforce.com', browser='chrome')
if instance.lower() == 'analytical':
EAS = EA.salesforceEinsteinAnalytics(env_url='https://spglobalratingsae.my.salesforce.com', browser='chrome')
EAS.load_df_to_EA(df_,dataset_api_name=ds_api_name, operation=operation_,xmd=xmd_,fillna=False) #Error because of fillna=False
clockPrint("Upload Process Completed successfully for "+instance+" instance. Navigate to (Einstein Analytics --> Data Manager --> Monitor) to check progress.")
def processDate(date):
if pd.isnull(date):
return np.nan
else:
date = pd.to_datetime(date)
date = datetime.datetime.strftime(date,"%m/%d/%Y")
return date
if __name__ == '__main__':
df = pd.read_csv(cfg.FILE_PATH)
if len(cfg.DATE_COLUMNS) != 0:
for c in cfg.DATE_COLUMNS:
df[c] = df[c].apply(lambda x: processDate(x))
for c in df.columns:
if df[c].dtype == "O":
df[c].fillna('', inplace=True)
elif np.issubdtype(df[c].dtype, np.number):
df[c].fillna(0, inplace=True)
elif df[c].dtype == "datetime64[ns]":
df[c] = df[c].apply(lambda x: processDate(x))
df[c].fillna("", inplace=True)
df.fillna("", inplace=True)
for instance in cfg.INSTANCES:
if instance.lower() == 'commercial':
uploadToEA(df, cfg.COM_DATASET_API_NAME, cfg.COM_OPERATION, instance, cfg.COM_XMD)
elif instance.lower() == 'analytical':
uploadToEA(df, cfg.ANA_DATASET_API_NAME, cfg.ANA_OPERATION, instance, cfg.ANA_XMD)
else: clockPrint("Update INSTANCES variable as ['Commercial'] or ['Analytical'] or ['Commercial','Analytical'].")
This is my complete python script which I want to convert it into a library. How should I do it?

How to update PySimpleGUI Listbox that reads an excel file

I am using python3.7 and this is the current code base(apologies for putting so much code but thought it would help overall)
def TRADE_ENTRY(df_names, df_underlyings,df_strategies, columns, param, out_path,recovery_path):
nameUpdate =0
strategyUpdate=0
underlyingUpdate=0
sg.theme('Dark Brown 1')
listing = [sg.Text(u, size = param) for u in columns]
header = [[x] for x in listing]
now = datetime.datetime.now()
core = [
sg.Input(f"{now.month}/{now.day}/{now.year}",size = param),
sg.Input(f"{now.hour}:{now.minute}:{now.second}",size = param),
sg.Listbox(list(df_strategies.STRATEGIES), size=(20,2), enable_events=False, key='_PLAYERS0_'),
sg.Listbox(['ETF', 'EQT', 'FUT', 'OPT', 'BOND'],enable_events=False,key='_PLAYERS20_',size = (20,2)),
sg.Listbox(list(df_names.NAMES), size=(20,4), enable_events=False,key='_PLAYERS6_'),
sg.Listbox( ['B', 'S'],size = (20,1),enable_events=False,key='_PLAYERS12_'),
sg.Input(size = param),
sg.Input(size = param),
sg.CalendarButton('Calendar', pad=None, font=('MS Sans Serif', 10, 'bold'),
button_color=('yellow', 'brown'), format=('%d/%m/%Y'), key='_CALENDAR_', target='_INP_'),
sg.Input(size = param),
sg.Listbox(list(df_underlyings.UNDERLYINGS), size=(20,4), enable_events=False,key='_PLAYERS2_'),
sg.Listbox(['C', 'P', 'N/A'],size = param),
]
mesh = [[x,y] for (x,y) in list(zip(listing, core))]
mesh[8].append(sg.Input(size = (10,2),key = '_INP_'))
layout =[[sg.Button("SEND"),sg.Button("NEW_NAME"), sg.Button("NEW_STRAT"), sg.Button("NEW_UND")] ]+ mesh
window = sg.Window('Trade Entry System', layout, font='Courier 12').Finalize()
while True:
event, values = window.read(timeout=500)
#print('EVENT, VALUES', event, values)# all the inputs with extra information for compiler
if event == "SEND":
data = values
a = list(data.values())
a = [x if isinstance(x, list) == False else empty_handler(x) for x in a]
a = [x if x !="" else "EMPTY" for x in a ]
#print('A', a)#all the inputs now in a list
df = pd.DataFrame(a, index = columns)
print('DF1', df)#columns dataframe with column names and then the values
df = df.transpose()
#print('DF2', df)#rows dataframe with column names and then the values
status = error_handling(df)
#print('STATUS', status)
if status == "ERROR":
print("YOU MUST RECTIFY INPUT")
elif status == "CORRECT":
#if a future then will overwrite its name
if df['TYPE'][0] == "FUT":
df['NAME'][0] = "F-"+ df['UNDERLYING'][0] + "-" +df['EXPIRATION'][0]
#if an option then will overwrite its name
elif df['TYPE'][0] =="OPT":
df['NAME'][0] = 'O-' + df['UNDERLYING'][0] + "--" + df['OPTION_TYPE'][0] +df['STRIKE'][0] +"--" +df['EXPIRATION'][0]
else:
pass
processing(df, recovery_path, out_path)
else:
print("ERROR WITH USER INPUT FATAL")
break
elif event == "NEW_NAME":
security_creation(r'Y:\NAMES.xlsx', "Sheet1", "NAME", param)
nameUpdate=1
continue
elif event == "NEW_STRAT":
security_creation(r'Y:\STRATEGIES.xlsx', "Sheet1", "STRATEGY", param)
strategyUpdate=1
continue
elif event == "NEW_UND":
security_creation(r'Y:\UNDERLYINGS.xlsx', "Sheet1", "UNDERLYINGS", param)
underlyingUpdate=1
continue
elif event == sg.TIMEOUT_KEY:
if(nameUpdate==1):
df_names = pd.read_excel(r'Y:\NAMES.xlsx', "Sheet1")
df =df_names.values.tolist()
window['_PLAYERS6_'].update(values=df, set_to_index=0)
if(underlyingUpdate==1):
df_underlyings = pd.read_excel(r'Y:\UNDERLYINGS.xlsx', "Sheet1")
df =df_underlyings.values.tolist()
window['_PLAYERS2_'].update(values=df, set_to_index=0)
if(strategyUpdate==1):
df_strategies = pd.read_excel(r'Y:\STRATEGIES.xlsx', "Sheet1")
df =df_strategies.values.tolist()
window['_PLAYERS0_'].update(values=df, set_to_index=0)
print("Listboxes updated !")
else:
print("OVER")
break
window.close()
TRADE_ENTRY(df_names, df_underlyings,df_strategies, columns, param,out_path, recovery_path)
Towards the end of the function there's 3 elif, all NEW_NAME, NEW_STRAT and NEW_UND are the user submitting information to the corresponding 3 excel files. The function security_creation actually updates said excel files. Below that I am trying to update the Listboxes but no luck.
Any help would be greatly appreciated since i am so confused

How do I get all the prices history with binance API for a crypto using Python?

I've been using this script to get the prices from some cryptocurrencies using Binance API and this script:
https://steemit.com/python/#marketstack/how-to-download-historical-price-data-from-binance-with-python
The problem is that with this script I cannot control the date range: for example, I want to choose the period range between Dec. 2015 and Dec. 2020, or I want the DAILY PRICES from the first day trading for any crypto ...etc.
So I share with you the code I'm using (copied from the steemit code and modified a little bit)
How can I do it?
# https://steemit.com/python/#marketstack/how-to-download-historical-price-data-from-binance-with-python###
import requests
import json
import pandas as pd
import numpy as np
import datetime as dt
frequency = input("Please enter the frequency (1m/5m/30m/.../1h/6h/1d/ : ")
def get_bars(symbol, interval=frequency):
root_url = 'https://api.binance.com/api/v1/klines'
url = root_url + '?symbol=' + symbol + '&interval=' + interval
data = json.loads(requests.get(url).text)
df = pd.DataFrame(data)
df.columns = ['open_time',
'o', 'h', 'l', 'c', 'v',
'close_time', 'qav', 'num_trades',
'taker_base_vol', 'taker_quote_vol', 'ignore']
df.index = [dt.datetime.fromtimestamp(x / 1000.0) for x in df.close_time]
return df
btcusdt = get_bars('BTCUSDT')
ethusdt = get_bars('ETHUSDT')
df0=pd.DataFrame(btcusdt)
df0.to_csv('_btcusdt.csv')
df1=pd.DataFrame(ethusdt)
df1.to_csv('_ethusdt.csv')
Can anyone help me to optimize it?

I am using this out of the binance documentation : https://python-binance.readthedocs.io/en/latest/binance.html?highlight=get_historical_klines#binance.client.Client.get_historical_klines
import os
from binance.client import Client
import pandas as pd
import datetime, time
def GetHistoricalData(self, howLong):
self.howLong = howLong
# Calculate the timestamps for the binance api function
self.untilThisDate = datetime.datetime.now()
self.sinceThisDate = self.untilThisDate - datetime.timedelta(days = self.howLong)
# Execute the query from binance - timestamps must be converted to strings !
self.candle = self.client.get_historical_klines("BNBBTC", Client.KLINE_INTERVAL_1MINUTE, str(self.sinceThisDate), str(self.untilThisDate))
# Create a dataframe to label all the columns returned by binance so we work with them later.
self.df = pd.DataFrame(self.candle, columns=['dateTime', 'open', 'high', 'low', 'close', 'volume', 'closeTime', 'quoteAssetVolume', 'numberOfTrades', 'takerBuyBaseVol', 'takerBuyQuoteVol', 'ignore'])
# as timestamp is returned in ms, let us convert this back to proper timestamps.
self.df.dateTime = pd.to_datetime(self.df.dateTime, unit='ms').dt.strftime(Constants.DateTimeFormat)
self.df.set_index('dateTime', inplace=True)
# Get rid of columns we do not need
self.df = self.df.drop(['closeTime', 'quoteAssetVolume', 'numberOfTrades', 'takerBuyBaseVol','takerBuyQuoteVol', 'ignore'], axis=1)
print(self.df)
I do hope this helps someone.
(Please note this method is cut out of a class I have, so you may get rid of all of the self-s) , and you need to have your client set up before by
client = Client(api_key, api_secret)
Any improvements are of course welcome !

This is a function that I used.
Start and end are dates in Unix timestamp format. Interval is graph interval.
And keep in mind Binance did not exist in Dec 2015 :-)
def get_klines_iter(symbol, interval, start, end, limit=5000):
df = pd.DataFrame()
startDate = end
while startDate>start:
url = 'https://api.binance.com/api/v3/klines?symbol=' + \
symbol + '&interval=' + interval + '&limit=' + str(iteration)
if startDate is not None:
url += '&endTime=' + str(startDate)
df2 = pd.read_json(url)
df2.columns = ['Opentime', 'Open', 'High', 'Low', 'Close', 'Volume', 'Closetime', 'Quote asset volume', 'Number of trades','Taker by base', 'Taker buy quote', 'Ignore']
df = pd.concat([df2, df], axis=0, ignore_index=True, keys=None)
startDate = df.Opentime[0]
df.reset_index(drop=True, inplace=True)
return df

from datetime import datetime
import pandas as pd
import requests
from typing import *
import time
class BinanceClient:
def __init__(self, futures=False):
self.exchange = "BINANCE"
self.futures = futures
if self.futures:
self._base_url = "https://fapi.binance.com"
else:
self._base_url = "https://api.binance.com"
self.symbols = self._get_symbols()
def _make_request(self, endpoint: str, query_parameters: Dict):
try:
response = requests.get(self._base_url + endpoint, params=query_parameters)
except Exception as e:
print("Connection error while making request to %s: %s", endpoint, e)
return None
if response.status_code == 200:
return response.json()
else:
print("Error while making request to %s: %s (status code = %s)",
endpoint, response.json(), response.status_code)
return None
def _get_symbols(self) -> List[str]:
params = dict()
endpoint = "/fapi/v1/exchangeInfo" if self.futures else "/api/v3/exchangeInfo"
data = self._make_request(endpoint, params)
symbols = [x["symbol"] for x in data["symbols"]]
return symbols
def get_historical_data(self, symbol: str, interval: Optional[str] = "1m", start_time: Optional[int] = None, end_time: Optional[int] = None, limit: Optional[int] = 1500):
params = dict()
params["symbol"] = symbol
params["interval"] = interval
params["limit"] = limit
if start_time is not None:
params["startTime"] = start_time
if end_time is not None:
params["endTime"] = end_time
endpoint = "/fapi/v1/klines" if self.futures else "/api/v3/klines"
raw_candles = self._make_request(endpoint, params)
candles = []
if raw_candles is not None:
for c in raw_candles:
candles.append((float(c[0]), float(c[1]), float(c[2]), float(c[3]), float(c[4]), float(c[5]),))
return candles
else:
return None
def ms_to_dt_utc(ms: int) -> datetime:
return datetime.utcfromtimestamp(ms / 1000)
def ms_to_dt_local(ms: int) -> datetime:
return datetime.fromtimestamp(ms / 1000)
def GetDataFrame(data):
df = pd.DataFrame(data, columns=['Timestamp', "Open", "High", "Low", "Close", "Volume"])
df["Timestamp"] = df["Timestamp"].apply(lambda x: ms_to_dt_local(x))
df['Date'] = df["Timestamp"].dt.strftime("%d/%m/%Y")
df['Time'] = df["Timestamp"].dt.strftime("%H:%M:%S")
column_names = ["Date", "Time", "Open", "High", "Low", "Close", "Volume"]
df = df.set_index('Timestamp')
df = df.reindex(columns=column_names)
return df
def GetHistoricalData(client, symbol, start_time, end_time, limit=1500):
collection = []
while start_time < end_time:
data = client.get_historical_data(symbol, start_time=start_time, end_time=end_time, limit=limit)
print(client.exchange + " " + symbol + " : Collected " + str(len(data)) + " initial data from "+ str(ms_to_dt_local(data[0][0])) +" to " + str(ms_to_dt_local(data[-1][0])))
start_time = int(data[-1][0] + 1000)
collection +=data
time.sleep(1.1)
return collection
client = BinanceClient(futures=False)
symbol = "BTCUSDT"
interval = "1m"
fromDate = int(datetime.strptime('2021-11-15', '%Y-%m-%d').timestamp() * 1000)
toDate = int(datetime.strptime('2021-11-16', '%Y-%m-%d').timestamp() * 1000)
data = GetHistoricalData(client, symbol, fromDate, toDate)
df = GetDataFrame(data)
df

based on Mike Malyi and isnvi23h4's answer:
Please use python >= 3.7, the code does not need to install any dependencies
import pandas as pd
from datetime import datetime, timezone, timedelta
import calendar
def get_klines_iter(symbol, interval, start, end = None, limit=1000):
# start and end must be isoformat YYYY-MM-DD
# We are using utc time zone
# the maximum records is 1000 per each Binance API call
df = pd.DataFrame()
if start is None:
print('start time must not be None')
return
start = calendar.timegm(datetime.fromisoformat(start).timetuple()) * 1000
if end is None:
dt = datetime.now(timezone.utc)
utc_time = dt.replace(tzinfo=timezone.utc)
end = int(utc_time.timestamp()) * 1000
return
else:
end = calendar.timegm(datetime.fromisoformat(end).timetuple()) * 1000
last_time = None
while len(df) == 0 or (last_time is not None and last_time < end):
url = 'https://api.binance.com/api/v3/klines?symbol=' + \
symbol + '&interval=' + interval + '&limit=1000'
if(len(df) == 0):
url += '&startTime=' + str(start)
else:
url += '&startTime=' + str(last_time)
url += '&endTime=' + str(end)
df2 = pd.read_json(url)
df2.columns = ['Opentime', 'Open', 'High', 'Low', 'Close', 'Volume', 'Closetime',
'Quote asset volume', 'Number of trades', 'Taker by base', 'Taker buy quote', 'Ignore']
dftmp = pd.DataFrame()
dftmp = pd.concat([df2, dftmp], axis=0, ignore_index=True, keys=None)
dftmp.Opentime = pd.to_datetime(dftmp.Opentime, unit='ms')
dftmp['Date'] = dftmp.Opentime.dt.strftime("%d/%m/%Y")
dftmp['Time'] = dftmp.Opentime.dt.strftime("%H:%M:%S")
dftmp = dftmp.drop(['Quote asset volume', 'Closetime', 'Opentime',
'Number of trades', 'Taker by base', 'Taker buy quote', 'Ignore'], axis=1)
column_names = ["Date", "Time", "Open", "High", "Low", "Close", "Volume"]
dftmp.reset_index(drop=True, inplace=True)
dftmp = dftmp.reindex(columns=column_names)
string_dt = str(dftmp['Date'][len(dftmp) - 1]) + 'T' + str(dftmp['Time'][len(dftmp) - 1]) + '.000Z'
utc_last_time = datetime.strptime(string_dt, "%d/%m/%YT%H:%M:%S.%fZ")
last_time = (utc_last_time - datetime(1970, 1, 1)) // timedelta(milliseconds=1)
df = pd.concat([df, dftmp], axis=0, ignore_index=True, keys=None)
df.to_csv('0y_eth_only17andnew.csv', sep='\t', index=False)
get_klines_iter('ETHBUSD', '30m', '2022-01-01', '2022-02-21')

I do it like this:
def get_binance_data(api_key, pair, countdown='open', interval='4h', start='1 Jan 2018', end=None):
client = Client(api_key=api_key)
intervals = {
'15m': Client.KLINE_INTERVAL_15MINUTE,
'1h': Client.KLINE_INTERVAL_1HOUR,
'4h': Client.KLINE_INTERVAL_4HOUR,
'1d': Client.KLINE_INTERVAL_1DAY
}
interval = intervals.get(interval, '4h')
print(f'Historical interval {interval}')
klines = client.get_historical_klines(symbol=pair, interval=interval, start_str=start, end_str=end)
data = pd.DataFrame(klines)
data.columns = ['open_time','open', 'high', 'low', 'close', 'volume','close_time', 'qav','num_trades','taker_base_vol','taker_quote_vol', 'ignore']
data.index = [pd.to_datetime(x, unit='ms').strftime('%Y-%m-%d %H:%M:%S') for x in data.open_time]
usecols=['open', 'high', 'low', 'close', 'volume']
data = data[usecols]
data = data.astype('float')
return data
api_key = 'хххх...xxx' # use your api-key
symbol = 'ETHUSDT'
eth = get_binance_data(api_key, symbol)
eth.head()
Output:
Historical interval 4h
open high low close volume
2018-01-01 00:00:00 733.01 737.99 716.80 734.50 8739.23361
2018-01-01 04:00:00 734.99 763.55 730.01 751.99 9492.34734
2018-01-01 08:00:00 751.77 759.00 730.58 741.01 8939.36851
2018-01-01 12:00:00 741.01 752.27 724.15 748.80 11284.08664
2018-01-01 16:00:00 748.27 749.98 733.00 746.23 7757.00362

import requests
market = 'ETHEUR'
tick_interval = '1d'
url = 'https://api.binance.com/api/v3/klines?symbol='+market+'&interval='+tick_interval
data = requests.get(url).json()
print(data)

Is python pandas dataframe too slow?

I have an interesting problem. I have two files, NYPD_Motor_Collisions.csv has 1.2M lines and weatherfinal.txt has 109K lines. The objective is to merge the temp and prec data from weatherfinal.txt to the Collisions files as two columns based on the latitudes and longitudes. I wrote the following code using dataframe in pandas python.
from math import cos, asin, sqrt
import pandas as pd
import numpy as np
import os
import re
import datetime
def distance(lat1, lon1, lat2, lon2):
p = 0.017453292519943295
a = 0.5 - cos((lat2-lat1)*p)/2 + cos(lat1*p)*cos(lat2*p) * (1-cos((lon2-lon1)*p)) / 2
return 12742 * asin(sqrt(a))
def closest(data, v):
return min(data, key=lambda p: distance(v['lat'],v['lon'],p['lat'],p['lon']))
tempDataList = []
#v = {'lat': 39.7622290, 'lon': -86.1519750}
#print(closest(tempDataList, v))
print os.getcwd()
filed_ = open("weatherfinal.txt", 'r')
fileo_ = open("weatherfinal_updated.txt","w")
lines_ = filed_.readlines()
for line_ in lines_:
outline = re.sub(" +"," ",line_)
fileo_.write(outline + "\n")
fileo_.close()
df = pd.read_csv("NYPD_Motor_Vehicle_Collisions.csv")
colhead = np.append(df.columns.values,['TEMP', 'PREP'])
outdf = pd.DataFrame(columns=colhead)
df2 = pd.read_csv("weatherfinal_updated.txt",' ')
df2.set_index(['WBANNO', 'LST_DATE', 'LST_TIME'])
sensorIds = df2['WBANNO'].unique()
for ids_ in sensorIds:
longitude = df2.loc[df2['WBANNO']==ids_,'LONGITUDE'].iloc[0]
latitude = df2.loc[df2['WBANNO'] == ids_, 'LATITUDE'].iloc[0]
tempDataList.append({'lat':latitude,'lon':longitude,'SENSORID': ids_ })
print tempDataList
for index, row in df.iterrows():
lon_ = row['LONGITUDE']
lat_ = row['LATITUDE']
tdate = row['DATE']
ttime = row['TIME']
tcal = 5
pcal = 0
fwdate = datetime.datetime.strptime(str(tdate), '%m/%d/%Y').strftime('%Y%m%d')
fwtime = datetime.datetime.strptime(str(ttime), '%H:%M').strftime('%H%M')
ntime = float(fwtime) + float(100)
closests_ = closest(tempDataList, {'lat':lat_,'lon':lon_})
sensorid = closests_['SENSORID']
usedSensorId = sensorid
selectedWeatherRow = df2.loc[(df2.WBANNO == sensorid) & (df2.LST_DATE == float(fwdate)) & (df2.LST_TIME >= float(fwtime)) & (df2.LST_TIME < ntime) ,['T_CALC', 'P_CALC']]
if len(selectedWeatherRow.index) == 0:
for sensId in sensorIds:
if sensId == sensorid:
continue
selectedWeatherRow = df2.loc[(df2.WBANNO == sensId) & (df2.LST_DATE == float(fwdate)) & (df2.LST_TIME >= float(fwtime)) & (df2.LST_TIME < ntime), ['T_CALC', 'P_CALC']]
if len(selectedWeatherRow.index) == 0:
continue
else:
tcal = selectedWeatherRow['T_CALC'].values[0]
pcal = selectedWeatherRow['P_CALC'].values[0]
usedSensorId = sensId
break
else:
tcal = selectedWeatherRow['T_CALC'].values[0]
pcal = selectedWeatherRow['P_CALC'].values[0]
row['TEMP'] = tcal
row['PREP'] = pcal
outdf.loc[index] = row
print index, tcal, pcal, fwdate, fwtime, ntime, usedSensorId
print "Loop completed"
outdf.to_csv("NYPD_TRAFFIC_DATA.csv")
print "file completed"
This program has been running for days. Not sure why dataframe is too slow. I rewrote the program without dataframe using dictionaries and it completed in a few minutes. Not sure if dataframe is slow or I am not using it correctly. Just posting here for learning.

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

How to appy threading to make it run faster? - python

Related

How can I take the last value -1 pandas

How to create library of a self-written static function in python

How to update PySimpleGUI Listbox that reads an excel file

How do I get all the prices history with binance API for a crypto using Python?

Is python pandas dataframe too slow?

Categories

Resources