Use of function in pandas dataframe with multiprocessing - python

I am attempting to speed up calculations on a pandas DataFrame using multiprocessing. This goes really well, except that assigning the result of the calculation via df.ix does not work here the way it does in my code without multiprocessing.
I've added a #sanity check to the code, which prints valid values and would make me think this should work fine, but the DataFrame never gets populated (it stays all NaN). Does anyone know why that may be and, more importantly, what changes are needed to plug the values into the DataFrame in the context of multiprocessing?
Output of sanity check:
should be setting df.ix[4][1] to: 23.2506112824
should be setting df.ix[0][0] to: 0.0
should be setting df.ix[7][0] to: 15.9574526264
code:
import mysql.connector
import numpy as np
from colormath.color_objects import LabColor
from colormath.color_diff import delta_e_cie2000
import pandas as pd
from mysql.connector.pooling import MySQLConnectionPool
from multiprocessing import Pool

pool = Pool()
cnx = mysql.connector.connect(user='user', password='pass', host='localhost', database='database')
cursor = cnx.cursor()
selectstmt = 'SELECT CIE_Lab, ID FROM `database`.`table`'
cursor.execute(selectstmt)
color = cursor.fetchall()
df = pd.DataFrame(columns=color, index=color)
sides = df.index
headers = df.dtypes.index
shape = df.shape[0]

def delta(cie_Lab1, cie_Lab2):
    cie_Lab1 = cie_Lab1[1:]
    cie_Lab1 = cie_Lab1[:-1]
    cie_Lab2 = cie_Lab2[1:]
    cie_Lab2 = cie_Lab2[:-1]
    CIE_list1 = cie_Lab1.split(",")
    CIE_list2 = cie_Lab2.split(",")
    #print CIE_list1
    CIE_L1 = CIE_list1[0]
    CIE_a1 = CIE_list1[1]
    CIE_b1 = CIE_list1[2]
    CIE_L2 = CIE_list2[0]
    CIE_a2 = CIE_list2[1]
    CIE_b2 = CIE_list2[2]
    color1 = LabColor(lab_l=CIE_L1, lab_a=CIE_a1, lab_b=CIE_b1)
    color2 = LabColor(lab_l=CIE_L2, lab_a=CIE_a2, lab_b=CIE_b2)
    deltae = delta_e_cie2000(color1, color2, Kl=1, Kc=1, Kh=1)
    return deltae

def deltas(nums):
    listoflists = []
    for num in range(nums):
        for mun in range(nums):
            listoflists.append([num, mun])
    return listoflists

def update(inp):
    sides = df.index
    headers = df.dtypes.index
    num = inp[0]
    mun = inp[1]
    res = delta(headers[num][0], sides[mun][0])
    #sanity check
    print "should be setting df.ix["+str(mun)+"]["+str(num)+"] to: "+str(res)
    df.ix[mun][num] = res

if __name__ == '__main__':
    pool = Pool(4)
    pool.map(update, deltas(shape))
    pool.close()
    pool.join()
    print df
Dataframe example:
([69.62248143012944, -54.15108764844451, 67.92070706614288], 1) \
([69.62248143012944, -54.15108764844451, 67.920... NaN
([58.17848217611454, -52.251714243997995, 56.77... NaN
([87.02539335188214, -32.15758725885986, 66.450... NaN
([86.86259502866965, -31.483524711078015, 75.14... NaN
([85.39154525710671, -31.683349117376856, 71.35... NaN
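The root cause is that each Pool worker runs in a separate process with its own copy of df, so the df.ix assignment inside update succeeds in the child but never reaches the parent's DataFrame; that is why the sanity check prints correct values while df stays NaN. Below is a minimal sketch of the usual fix, assuming the df, delta, and deltas defined above: have the workers return their results and do the assignment in the parent (df.ix is also long deprecated, so the sketch uses df.iloc).

# Sketch: workers compute and return (row, col, value); the parent assigns.
# Assumes df, headers, sides, shape, delta() and deltas() from the code above.
def compute(inp):
    num, mun = inp
    return mun, num, delta(headers[num][0], sides[mun][0])

if __name__ == '__main__':
    pool = Pool(4)
    for mun, num, res in pool.map(compute, deltas(shape)):
        df.iloc[mun, num] = res  # runs in the parent, so df is really updated
    pool.close()
    pool.join()
    print(df)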


How to pass two or more dataframes from a module to main script

Edit with @RJ Adriaansen's update:
I'm trying to pull two or more dataframes from a module so that I can use the data in the main script.
I only get 4 empty dataframes returned from the df_make module.
The main and df_make codes are below.
Any advice would be greatly appreciated, thanks.
import pandas as pd
import df_make

df_trn = pd.DataFrame()
df_trn_trk = pd.DataFrame()
df_jky = pd.DataFrame()
df_jky_code = pd.DataFrame()

def main():
    df_make.jky_trn(df_trn, df_trn_trk, df_jky, df_jky_code)
    #df_make.jky_trn([df_trn])
    print(df_trn)
    print(df_trn_trk)
    print(df_jky)
    print(df_jky_code)

if __name__ == '__main__':
    main()
import pandas as pd

#def jky_trn(df_trn):
def jky_trn(df_trn, df_trn_trk, df_jky, df_jky_code):
    #global df_trn
    #global df_trn_trk
    #global df_jky
    #global df_jky_code
    path = (r"C:\Users\chris\Documents\UKHR\PythonSand\PY_Scripts\StackOF")
    xls_tbl = "\Racecards.xlsx"
    xls_link = path + xls_tbl
    df1 = pd.read_excel(xls_link, usecols=["Jockey", "Course", "RaceDesc"])
    df2 = pd.read_excel(xls_link, usecols=["Trainer", "Course", "RaceDesc"])
    df1 = df1.drop_duplicates(subset=["Jockey", "Course", "RaceDesc"])
    df1 = df1.dropna()  # Remove rows with NaN
    df1['Course'] = df1['Course'].str.replace(' \(AW\)', '')  # Replace (AW) in Course
    df2['Course'] = df2['Course'].str.replace(' \(AW\)', '')
    df_jky = df1[['Jockey']].copy()
    df_jky_code = df1[['Jockey', 'Course']].copy()
    df_jky = df_jky.drop_duplicates()
    df_jky_code = df_jky_code.drop_duplicates()
    df_trn = df2[['Trainer']].copy()
    df_trn_trk = df2[['Trainer', 'Course']].copy()
    df_trn = df_trn.drop_duplicates()
    df_trn_trk = df_trn_trk.drop_duplicates()
    #print(df_jky_code)
    #print(df_trn_trk)
    return df_jky, df_jky_code, df_trn, df_trn_trk
So, it turns out that I needed to refer to the dataframes as tuple items in the main script, e.g. df_jt = df_make.jky_trn().
The new main script code is:
import pandas as pd
import df_make

def main():
    df_jt = df_make.jky_trn()
    print(df_jt[0])
    print(df_jt[1])
    print(df_jt[2])
    print(df_jt[3])

if __name__ == '__main__':
    main()
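Equivalently, the tuple can be unpacked straight into named dataframes in one step; the names must follow the order of the return statement in df_make.jky_trn:

df_jky, df_jky_code, df_trn, df_trn_trk = df_make.jky_trn()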

Python multiprocessing multiple iterations

I am trying to use multiprocessing to speed up my data processing. I am working on a machine with 6 cores, and I want to iterate through a table of 12 million rows; for each of these rows I iterate through several time steps doing a calculation (executing a function).
This is the line I would like to split up so that it runs in parallel on different cores:
test = [rowiteration(i, output, ini_cols, cols) for i in a] # this should run in parallel
I tried something with
from multiprocessing import Pool
but I did not manage to pass the arguments of the function and the iterator.
I would appreciate any ideas. I am new to Python.
This is what I have:
import os
import pyreadr
import pandas as pd
import numpy as np
import time
from datetime import timedelta
import functools
from pathlib import Path

def read_data():
    current_path = os.getcwd()
    myfile = os.path.join(str(Path(current_path).parents[0]), 'dummy.RData')
    result = pyreadr.read_r(myfile)
    pc = result["pc"]
    u = result["u"]
    return pc, u

# add one column per time step
def prepare_output_structure(pc):
    ini_cols = pc.columns
    pc = pc.reindex(columns=[*pc.columns, *np.arange(0, 11), 'cat'], fill_value=0)
    pc.reset_index(level=0, inplace=True)
    # print(pc.columns, pc.shape, pc.dtypes)
    return pc, ini_cols

def conjunction(*conditions):
    return functools.reduce(np.logical_and, conditions)

def timeloop(t_final: int, count_final: int, tipo):
    if tipo == 'A':
        count_ini = 35
    else:  # 'B'
        count_ini = 30
    yy_list = []
    for t in np.arange(0, 11):
        yy = ((count_final - count_ini) / t_final) * t + count_ini
        yy_list.append(int(yy))
    return yy_list

def rowiteration(i, output, ini_cols, cols):
    c_2: bool = pc.loc[i, 'tipo'] == u.iloc[:, 0].str[:1]  # first character of category, e.g. 'A1'
    c_5: bool = pc.loc[i, 't_final'] >= u.iloc[:, 1]  # t_min (u)
    c_6: bool = pc.loc[i, 't_final'] <= (u.iloc[:, 2])  # t_max (u)
    pc.loc[i, 'cat'] = u[conjunction(c_2, c_5, c_6)].iloc[0, 0]
    pc.iloc[i, (0 + (len(ini_cols))+1):(10 + (len(ini_cols))+2)] = timeloop(int(pc.loc[i, 't_final']), int(pc.loc[i, 'count_final']), pc.loc[i, 'tipo'])
    out = pd.DataFrame(pc.iloc[i, :])
    out = pd.DataFrame(out.transpose(), columns=cols)
    output = output.append(out.iloc[0, :])
    return output

if __name__ == '__main__':
    start_time = time.time()
    pc, u = read_data()
    nrowpc = len(pc.index)
    a = np.arange(0, nrowpc)  # rows of table pc
    # print(a, nrowpc, len(pc.index))
    pc, ini_cols = prepare_output_structure(pc)
    cols = pc.columns
    output = pd.DataFrame()
    test = [rowiteration(i, output, ini_cols, cols) for i in a]  # this should run in parallel
    pc2 = pd.concat(test, ignore_index=True)
    pc2 = pc2.iloc[:, np.r_[5, (len(ini_cols)+1):(len(pc2.columns))]]
    print(pc2.head())
    elapsed_time_secs = time.time() - start_time
    msg = "Execution took: %s secs (Wall clock time)" % timedelta(seconds=elapsed_time_secs)
    print(msg)
Replace your [rowiteration(i, output, ini_cols, cols) for i in a] with:
from multiprocessing import Pool

n_cpu = 10  # set to the number of threads your CPU supports

with Pool(processes=n_cpu) as pool:
    ret = pool.starmap(rowiteration,
                       [(i, output, ini_cols, cols) for i in a])
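For context, starmap is used here instead of map because rowiteration takes four positional arguments: starmap unpacks each tuple in the iterable into a separate argument list. A tiny self-contained illustration:

from multiprocessing import Pool

def add(a, b):
    return a + b

if __name__ == '__main__':
    with Pool(2) as pool:
        # Each (a, b) tuple is unpacked into add(a, b).
        print(pool.starmap(add, [(1, 2), (3, 4)]))  # prints [3, 7]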
Here is an approach that I think solves the problem and that only sends what is necessary to the worker processes. I haven't tested this as is (which would be difficult without the data your code reads in), but this is the basic idea:
import multiprocessing as mp

p = mp.Pool(processes=mp.cpu_count())

# Note that you already define the static cols and ini_cols
# in global scope so you don't need to pass them to the Pool.

# ... Other functions you've defined ...

def rowiteration(row):
    c_2: bool = row['tipo'] == u.iloc[:, 0].str[:1]
    c_5: bool = row['t_final'] >= u.iloc[:, 1]
    c_6: bool = row['t_final'] <= (u.iloc[:, 2])
    row['cat'] = u[conjunction(c_2, c_5, c_6)].iloc[0, 0]
    row[(0 + (len(ini_cols))+1):(10 + (len(ini_cols))+2)] = timeloop(int(row['t_final']), int(row['count_final']), row['tipo'])
    return row

out = []
for row in p.imap_unordered(rowiteration, [r for _, r in pc.iterrows()]):
    row.index = cols
    out.append(row)
pc2 = pd.DataFrame(out).reset_index(drop=True)

Add a column to a dataframe in Python

I am trying to add a few columns to a dataframe - here is the code
import import_ipynb
import talib
import numpy
import yfinance as yf
import datetime as dt
import time
from datetime import datetime, timedelta
import sqlite3
import pandas
import numpy as np

conn = sqlite3.connect('Strategy_RSI_MACD_Data.db')
c = conn.cursor()
c.execute("select distinct Stock from Universe")
tickers = c.fetchall()

ticker_list = []
for row in tickers:
    if row[0]:
        ticker_list.append(row[0])

stockdetails = yf.download(
    tickers=ticker_list,
    period='6mo',
    interval='1d',
    group_by='ticker',
    auto_adjust=False,
    prepost=False,
    threads=True,
    proxy=None
)

df_ta = pandas.DataFrame(data=stockdetails['Adj Close'], dtype=numpy.float64)
stockdetails['RSI'] = df_ta.apply(lambda c: talib.RSI(c, timeperiod=14))
The last line is throwing this error:
ValueError: Wrong number of items passed 505, placement implies 1
How can I fix this?
Your lambda function is returning 505 values, whereas your assignment expects just one. Try converting the output into a list:
stockdetails['RSI'] = [df_ta.apply(lambda c: talib.RSI(c, timeperiod=14))]
I figured it out! I needed to insert a loop that iterates through the tickers:

for row in tickers:
    c.execute("select [Adj Close] from StockData where Symbol = ? ", (row))
    AdjClose = c.fetchall()
    df_ta = pd.DataFrame(data=AdjClose, dtype=numpy.float64)
    df_ta = df_ta.apply(lambda c: talib.RSI(c, timeperiod=14))
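For reference, the explicit loop can also be avoided: DataFrame.apply walks the frame column by column, so the per-ticker RSI can be computed in one shot. A sketch, assuming df_ta holds one float64 'Adj Close' column per ticker as in the original yf.download frame:

import numpy as np
import talib

# talib.RSI expects a 1-D float64 array; apply feeds it one column
# (one ticker) at a time and reassembles the results into a frame
# with the same shape and column names as df_ta.
rsi = df_ta.apply(lambda col: talib.RSI(col.to_numpy(dtype=np.float64),
                                        timeperiod=14))

This also sidesteps the original error, which came from assigning a 505-column result to the single column stockdetails['RSI'].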

Overwriting one data with another data in pandas(dataframe)

I fetch data periodically (every 120 seconds), but the most recent data overwrites the previous data in my SQL DB. I want all of the data to be saved. In addition, is the timer correct?
import sqlalchemy as sa
import psycopg2
import requests as rq
import pandas as pd
import json
import time

start_time = time.time()
while True:
    temp = pd.DataFrame()
    df = pd.DataFrame()
    vehicleList = {"SN63NBK", "YY67UTP"}
    for ids in vehicleList:
        r = rq.get('https://api.tfl.gov.uk/Vehicle/' + ids + '/Arrivals')
        r = r.text
        temp = pd.read_json(r)
        temp['Type'] = 'ids'
        df = pd.concat([df, temp], sort=False).reset_index(drop=True)
    engine = sa.create_engine('postgresql+psycopg2://postgres:3434@127.0.0.1/postgres')
    df['timing'] = list(map(lambda x: json.dumps(x), df['timing']))
    df.to_sql('tfl_bus_pg6', engine, if_exists='replace', index=False)
    time.sleep(120.0 - ((time.time() - start_time) % 120.0))
I changed your code slightly, but I think the main problem is the if_exists parameter, which you should set to 'append', as @K753 has mentioned in the comments.
Also, the YY67UTP id returns nothing, so I replaced it with another random id from the site to illustrate how the code works.
def _data_gen(vehicles):
    """ Yields a dataframe for each request """
    for ids in vehicles:
        time.sleep(1)
        r = rq.get('https://api.tfl.gov.uk/Vehicle/' + ids + '/Arrivals')
        temp = pd.read_json(r.text)
        temp['Type'] = ids
        yield temp

while True:
    # how do you break from the while loop if you need to?
    vehicleList = {"SN63NBK", "YY67UTP"}
    df = pd.concat(_data_gen(vehicleList), sort=False, ignore_index=True)
    engine = sa.create_engine('postgresql+psycopg2://postgres:3434@127.0.0.1/postgres')
    df['timing'] = list(map(lambda x: json.dumps(x), df['timing']))
    df.to_sql('tfl_bus_pg6', engine, if_exists='append', index=False)
    time.sleep(120)
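A minimal sketch of the difference between the two if_exists modes, using an in-memory SQLite engine purely for illustration:

import pandas as pd
import sqlalchemy as sa

engine = sa.create_engine('sqlite://')  # throwaway in-memory DB

batch1 = pd.DataFrame({'v': [1, 2]})
batch2 = pd.DataFrame({'v': [3, 4]})

# 'replace' drops and recreates the table, so only the last batch survives.
batch1.to_sql('t', engine, if_exists='replace', index=False)
batch2.to_sql('t', engine, if_exists='replace', index=False)
print(pd.read_sql('select * from t', engine)['v'].tolist())  # [3, 4]

# 'append' inserts into the existing table, so every batch is kept.
batch1.to_sql('t', engine, if_exists='replace', index=False)
batch2.to_sql('t', engine, if_exists='append', index=False)
print(pd.read_sql('select * from t', engine)['v'].tolist())  # [1, 2, 3, 4]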

python: using multiprocessing with a dataframe for geocoding

I am trying to return the zipcode as column in a dataframe. This code works but doesn't create a new column in the dataframe gps.
import geocoder
import multiprocessing as mp
import pandas as pd

google_key = 'key'

def reverse_gecode(coordinates):
    return geocoder.google(coordinates, key=google_key, method='reverse').postal

if __name__ == '__main__':
    gps = pd.DataFrame({'lat': [27.950575, 40.6936488],
                        'lon': [-82.4571776, -89.5889864]})  # dataframe method
    gps['gps'] = zip(gps.lat, gps.lon)
    x = list(gps['gps'])
    # multiprocessing
    pool = mp.Pool(processes=(mp.cpu_count() - 1))
    result_latlong = pool.map(reverse_gecode, x)
    pool.close()
    pool.join()
I have tried:

gps['zip_code'] = gps.apply(lambda x: pool.map(reverse_gecode, list(x[2])), axis=1)
gps['zip_code'] = gps.apply(lambda x: pool.map(reverse_gecode, x[2]), axis=1)
gps['zip_code'] = gps.apply(lambda x: pool.map(reverse_gecode, [x[0], x[1]]), axis=1)
But I just cannot get anything to work. The error I keep getting is
ValueError: ('Unknown location: 27.950575', u'occurred at index 0')
Try this:
import geocoder
import multiprocessing as mp
import pandas as pd

def reverse_gecode(coordinates):
    return geocoder.google(coordinates, method='reverse').postal

if __name__ == '__main__':
    gps = pd.DataFrame({'lat': [27.950575, 40.6936488],
                        'lon': [-82.4571776, -89.5889864]})  # dataframe method
    coords = gps[['lat', 'lon']].astype(str).apply(lambda x: (x[0], x[1]), axis=1).tolist()
    # multiprocessing
    pool = mp.Pool(processes=(mp.cpu_count() - 1))
    gps['zip_code'] = pool.map(reverse_gecode, coords)
    print(gps)
    pool.close()
    pool.join()
P.S. I've removed key=google_key from the geocoder.google() call because it didn't work for me.
Output:
lat lon zip_code
0 27.950575 -82.457178 33602
1 40.693649 -89.588986 61603
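This direct assignment is safe because Pool.map returns results in the same order as its input iterable, so the returned list lines up row-for-row with coords. A quick self-contained check:

from multiprocessing import Pool

def square(n):
    return n * n

if __name__ == '__main__':
    with Pool(2) as pool:
        # Results come back in input order, regardless of which
        # worker finished first.
        print(pool.map(square, [3, 1, 2]))  # prints [9, 1, 4]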
