How can I parallelize the following snippet of code in Python?

I have a bunch of matrix multiplication operations that are performed only row-wise. I was wondering how to speed up the computation by parallelizing them:
data = np.random.randint(1, 100, (100000, 800))
indices_1 = np.equal(data, 1)
A = np.zeros((100000, 100))
B = np.random.randn(800, 100)
for i in range(100000):
    ones = indices_1[i]
    not_ones = ~indices_1[i]
    B_ones = B[ones]
    B_not_ones = B[not_ones]
    A[i] = (data[i][not_ones] @ B_not_ones) @ np.linalg.inv(B_not_ones.T @ B_not_ones)
    data[i][ones] = A[i] @ B_ones.T
I tried multiprocessing, but for some reason it did not perform better than the sequential version. Here is my implementation:
from multiprocessing.pool import ThreadPool, Pool
pool = ThreadPool()  # can also use Pool

def f(i):
    ones = indices_1[i]
    not_ones = ~indices_1[i]
    B_ones = B[ones]
    B_not_ones = B[not_ones]
    A[i] = (data[i][not_ones] @ B_not_ones) @ np.linalg.inv(B_not_ones.T @ B_not_ones)
    data[i][ones] = A[i] @ B_ones.T

pool.map(f, range(100000))
Both yielded roughly the same running time (around 32 seconds). Other parallelization methods like concurrent.futures did not improve the runtime either (used as below):
with concurrent.futures.ThreadPoolExecutor() as executor:
    result = executor.map(f, range(100000))
I also tried to apply dask but could not make their framework work in my case. Any help will be much appreciated! Thanks!

import numpy as np
import multiprocessing as mp

data = list(np.random.randint(1, 100, (100000, 800)))
indices_1 = np.equal(data, 1)
A = list(np.zeros((100000, 100)))
B = np.random.randn(800, 100)

def f(data, A, i):
    ones = indices_1[i]
    not_ones = ~indices_1[i]
    B_ones = B[ones]
    B_not_ones = B[not_ones]
    A[i] = (data[i][not_ones] @ B_not_ones) @ np.linalg.inv(B_not_ones.T @ B_not_ones)
    data[i][ones] = A[i] @ B_ones.T

with mp.Manager() as manager:
    data_global = manager.list(data)
    A_global = manager.list(A)
    with mp.Pool() as p:
        results = [p.apply_async(f, (data_global, A_global, i,)) for i in range(100000)]
        for i in results:
            i.wait()
    data_global = list(data_global)
    A_global = list(A_global)
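Neither the thread pools nor the Manager-based variant above is likely to help: each row is a tiny task, the Python-level loop serializes in the thread case, and Manager proxies add inter-process traffic on every element access. Below is a minimal sketch of my own (untested, assuming a fork start method so workers inherit the arrays) that batches rows into chunks and returns results to the parent instead of mutating shared state:

import numpy as np
from multiprocessing import Pool

# Same setup as in the question, but as a float array so the write-back below
# does not silently truncate to integers.
data = np.random.randint(1, 100, (100000, 800)).astype(np.float64)
indices_1 = np.equal(data, 1)
A = np.zeros((100000, 100))
B = np.random.randn(800, 100)

def process_chunk(rows):
    # Each task handles a block of rows and returns its results to the parent.
    out = []
    for i in rows:
        ones = indices_1[i]
        not_ones = ~ones
        B_not_ones = B[not_ones]
        a = (data[i][not_ones] @ B_not_ones) @ np.linalg.inv(B_not_ones.T @ B_not_ones)
        out.append((i, a, a @ B[ones].T))
    return out

if __name__ == '__main__':
    chunks = np.array_split(np.arange(len(data)), 64)  # a few dozen tasks, not 100000
    with Pool() as pool:
        for chunk in pool.map(process_chunk, chunks):
            for i, a, replacement in chunk:
                A[i] = a
                data[i][indices_1[i]] = replacement

Whether this beats the sequential loop depends on how much of the 32 seconds is already spent inside the compiled BLAS calls rather than in Python bookkeeping.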

Related

How to make this task improve cpu usage?

I am trying to hash many blocks of data, but it does not use the CPU's full power; it only consumes about 25%. I tried moving the heavy processing into threads, but there is still no difference. I come from Node.js, where the sharp library consumes all of the CPU for the same kind of task. How can I make Python use its full power?
import cv2
import math
import datetime
import hashlib
import threading

def thread_function(image, yPos, xPos, wSizeBlock, hSizeBlock):
    block = image[yPos:yPos+wSizeBlock, xPos:xPos+hSizeBlock]
    hash = hashlib.sha256()
    hash.update(block.tobytes())
    print(hash.hexdigest())

image = cv2.imread('frame323.jpg', cv2.IMREAD_COLOR)
dimension = {
    'width': image.shape[1],
    'height': image.shape[0]
}
wSizeBlock = int(16)
hSizeBlock = int(16)
wBlockLength = math.floor(dimension['width'] / wSizeBlock)
hBlockLength = math.floor(dimension['height'] / hSizeBlock)
count = 0
start_time = datetime.datetime.now()
print(start_time)
for k in range(0, 500):
    for i in range(0, wBlockLength):
        for j in range(0, hBlockLength):
            xPos = int(i*wSizeBlock)
            yPos = int(j*hSizeBlock)
            x = threading.Thread(target=thread_function, args=(image, xPos, yPos, wSizeBlock, hSizeBlock))
            x.start()
            count += 1
    count = 0
end_time = datetime.datetime.now()
print(end_time)
For CPU-intensive operations that can be split up into smaller tasks, you would want to use the multiprocessing module. It is similar to the threading module in that it allows multiple functions to be run at once. The syntax looks something like this:
import multiprocessing as mp

def add(a, b):
    return a + b

p = mp.Process(target=add, args=(1, 2))
p.start()
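Applied to the block-hashing case above, here is a hedged sketch of mine (untested; frame323.jpg and the 16-pixel block size come from the question) that slices the blocks in the parent and lets a process pool do the hashing:

import cv2
import hashlib
from multiprocessing import Pool

def hash_block(block_bytes):
    # sha256 over the raw bytes of one block; runs in a worker process.
    return hashlib.sha256(block_bytes).hexdigest()

if __name__ == '__main__':
    image = cv2.imread('frame323.jpg', cv2.IMREAD_COLOR)
    size = 16
    height, width = image.shape[:2]
    # Extract the block bytes in the parent so only small byte strings are pickled.
    blocks = [image[y:y + size, x:x + size].tobytes()
              for y in range(0, height - size + 1, size)
              for x in range(0, width - size + 1, size)]
    with Pool() as pool:
        # chunksize batches many tiny blocks per task; hashing one 16x16 block is
        # so cheap that per-task overhead would otherwise dominate.
        digests = pool.map(hash_block, blocks, chunksize=256)
    print(len(digests), "blocks hashed")

Each block here is only a few hundred bytes, so the gain over threads may still be modest; giving each task a larger slice of the image is the knob that matters most.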

MyHDL Signals inside functions not showing up in VCD

Should I be able to see these in the generated VCD file?
@always(clk.posedge)
def MentorCluster():
    j = Signal(0)
    mentorq0, mentorq1, mentorq2, mentorq3 = [[Signal(0) for j in range(10)] for i in range(4)]
I can see all the signals I created at the top level, but not the ones local to the function.
Here is the code I used to generate the VCD:
def simulate(timesteps):
    traceSignals.timescale = "1ps"
    tb = traceSignals(avaca)
    sim = Simulation(tb)
    sim.run(timesteps)
    sim.quit()

# simulate for 2000 ticks (picoseconds) -- very ambitious to do all this in 2ns!
simulate(2000)
Signals created inside an `always` block will not only not show up in the .vcd, but they also won't work.
Here is a small test program to try this.
from myhdl import (block, always_seq, Signal, intbv, instance, instances,
                   delay, StopSimulation)
@block
def f1(clk, sigin, sigout):
    # this is the place to declare Signals and ListOfSignals
    sigind1 = Signal(intbv(0)[4:])
    mentorq0, mentorq1, mentorq2, mentorq3 = [[Signal(bool(0)) for j in range(10)] for i in range(4)]

    @always_seq(clk.posedge, reset=None)
    def f1s():
        # declaring Signals and ListOfSignals won't work
        # sigind1 = Signal(intbv(0)[4:])
        # mentorq0, mentorq1, mentorq2, mentorq3 = [[Signal(bool(0)) for j in range(10)] for i in range(4)]
        sigind1.next = sigin
        mentorq0[0].next = sigind1[0]
        mentorq1[0].next = sigind1[1]
        mentorq2[0].next = sigind1[2]
        mentorq3[0].next = sigind1[3]
        for i in range(1, 10):
            mentorq0[i].next = mentorq0[i - 1]
            mentorq1[i].next = mentorq1[i - 1]
            mentorq2[i].next = mentorq2[i - 1]
            mentorq3[i].next = mentorq3[i - 1]
        sigout.next[0] = mentorq0[9]
        sigout.next[1] = mentorq1[9]
        sigout.next[2] = mentorq2[9]
        sigout.next[3] = mentorq3[9]

    return f1s

if __name__ == '__main__':
    import random
    random.seed('We want repeatable randomness')

    @block
    def tb_f1():
        clk = Signal(bool(0))
        sigin = Signal(intbv(0)[4:])
        sigout = Signal(intbv(0)[4:])
        tCK = 10

        dut = f1(clk, sigin, sigout)

        @instance
        def genclk():
            while True:
                clk.next = not clk
                yield delay(int(tCK // 2))

        @instance
        def stimulus():
            yield delay(int(tCK * 3.5))
            for __ in range(10):
                sigin.next = random.randint(1, 15)
                yield delay(tCK)
            yield delay(tCK * 20)
            raise StopSimulation

        return instances()

    # finally
    dft = tb_f1()
    dft.config_sim(trace=True)
    dft.run_sim()
If we instead enable the Signal declarations inside the always block (the commented-out lines above), sigout remains 0.
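Condensed to the bare pattern (a sketch of mine, using the same MyHDL API as the test program above): declare Signals in the body of the @block function, and keep only .next assignments inside the sequential generator.

from myhdl import block, always_seq, Signal

@block
def good(clk, sigout):
    # Signals declared here are simulated and appear in the VCD trace.
    tap = Signal(bool(0))

    @always_seq(clk.posedge, reset=None)
    def logic():
        # Creating new Signal objects here would neither simulate correctly
        # nor be traced; only .next assignments belong inside the generator.
        tap.next = not tap
        sigout.next = tap

    return logic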

Python multiprocessing multiple iterations

I am trying to use multiprocessing to speed up my data processing. I am working on a machine with 6 cores, so I want to iterate through a table of 12 million rows, and for each of these rows I iterate through several time steps doing a calculation (executing a function).
This is the line I would like to split up so that it runs in parallel on different cores:
test = [rowiteration(i, output, ini_cols, cols) for i in a] # this should run in parallel
I tried something with
from multiprocessing import Pool
but I did not manage to pass the arguments of the function and the iterator.
I would appreciate any ideas. I am new to Python.
This is what I have:
import os
import pyreadr
import pandas as pd
import numpy as np
import time
from datetime import timedelta
import functools
from pathlib import Path

def read_data():
    current_path = os.getcwd()
    myfile = os.path.join(str(Path(current_path).parents[0]), 'dummy.RData')
    result = pyreadr.read_r(myfile)
    pc = result["pc"]
    u = result["u"]
    return pc, u

# add one column per time
def prepare_output_structure(pc):
    ini_cols = pc.columns
    pc = pc.reindex(columns=[*pc.columns, *np.arange(0, 11), 'cat'], fill_value=0)
    pc.reset_index(level=0, inplace=True)
    # print(pc.columns, pc.shape, pc.dtypes)
    return pc, ini_cols

def conjunction(*conditions):
    return functools.reduce(np.logical_and, conditions)

def timeloop(t_final: int, count_final: int, tipo):
    if tipo == 'A':
        count_ini = 35
    else:  # B
        count_ini = 30
    yy_list = []
    for t in np.arange(0, 11):
        yy = ((count_final - count_ini) / t_final) * t + count_ini
        yy_list.append(int(yy))
    return yy_list

def rowiteration(i, output, ini_cols, cols):
    c_2: bool = pc.loc[i, 'tipo'] == u.iloc[:, 0].str[:1]  # first character of category e.g. 'A1'
    c_5: bool = pc.loc[i, 't_final'] >= u.iloc[:, 1]  # t_min (u)
    c_6: bool = pc.loc[i, 't_final'] <= (u.iloc[:, 2])  # t_max (u)
    pc.loc[i, 'cat'] = u[conjunction(c_2, c_5, c_6)].iloc[0, 0]
    pc.iloc[i, (0 + (len(ini_cols))+1):(10 + (len(ini_cols))+2)] = timeloop(int(pc.loc[i, 't_final']), int(pc.loc[i, 'count_final']), pc.loc[i, 'tipo'])
    out = pd.DataFrame(pc.iloc[i, :])
    out = pd.DataFrame(out.transpose(), columns=cols)
    output = output.append(out.iloc[0, :])
    return output

if __name__ == '__main__':
    start_time = time.time()
    pc, u = read_data()
    nrowpc = len(pc.index)
    a = np.arange(0, nrowpc)  # rows of the pc table
    # print(a, nrowpc, len(pc.index))
    pc, ini_cols = prepare_output_structure(pc)
    cols = pc.columns
    output = pd.DataFrame()
    test = [rowiteration(i, output, ini_cols, cols) for i in a]  # this should run in parallel
    pc2 = pd.concat(test, ignore_index=True)
    pc2 = pc2.iloc[:, np.r_[5, (len(ini_cols)+1):(len(pc2.columns))]]
    print(pc2.head)
    elapsed_time_secs = time.time() - start_time
    msg = "Execution took: %s secs (Wall clock time)" % timedelta(seconds=elapsed_time_secs)
    print(msg)
Replace your [rowiteration(i, output, ini_cols, cols) for i in a] with:
from multiprocessing import Pool

n_cpu = 10  # set to the number of CPU threads available
with Pool(processes=n_cpu) as pool:
    ret = pool.starmap(rowiteration,
                       [(i, output, ini_cols, cols) for i in a])
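A usage sketch of mine for slotting this into the original script (untested; it replaces the if __name__ == '__main__' block above and keeps the rest of that script): rowiteration reads the global pc and u, so this assumes a fork start method (Linux) where the workers inherit them after read_data() has run, and chunksize is an optional knob to cut per-row overhead.

from multiprocessing import Pool

if __name__ == '__main__':
    pc, u = read_data()
    a = np.arange(0, len(pc.index))
    pc, ini_cols = prepare_output_structure(pc)
    cols = pc.columns
    output = pd.DataFrame()
    with Pool(processes=6) as pool:  # the question mentions a 6-core machine
        # starmap unpacks each tuple into rowiteration(i, output, ini_cols, cols);
        # chunksize hands workers batches of rows instead of one row at a time.
        test = pool.starmap(rowiteration,
                            [(i, output, ini_cols, cols) for i in a],
                            chunksize=1000)
    pc2 = pd.concat(test, ignore_index=True)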
Here is an approach that I think solves the problem and that only sends what is necessary to the worker processes. I haven't tested this as is (which would be difficult without the data your code reads in), but this is the basic idea:
import multiprocessing as mp

p = mp.Pool(processes=mp.cpu_count())

# Note that you already define the static cols and ini_cols
# in global scope so you don't need to pass them to the Pool.

# ... Other functions you've defined ...

def rowiteration(row):
    c_2: bool = row['tipo'] == u.iloc[:, 0].str[:1]
    c_5: bool = row['t_final'] >= u.iloc[:, 1]
    c_6: bool = row['t_final'] <= (u.iloc[:, 2])
    row['cat'] = u[conjunction(c_2, c_5, c_6)].iloc[0, 0]
    row[(0 + (len(ini_cols))+1):(10 + (len(ini_cols))+2)] = timeloop(int(row['t_final']), int(row['count_final']), row['tipo'])
    return row

out = []
for row in p.imap_unordered(rowiteration, [r for _, r in pc.iterrows()]):
    row.index = cols
    out.append(row)

pc2 = pd.DataFrame(out).reset_index(drop=True)

Python multiprocessing not running

I tried to use multiprocessing on this for loop:
def segment_features(segment_pixels):
    features = []
    npixels, nbands = segment_pixels.shape
    for b in range(nbands):
        stats = scipy.stats.describe(segment_pixels[:, b])
        band_stats = list(stats.minmax) + list(stats)[2:]
        if npixels == 1:
            # in this case the variance = nan, change it 0.0
            band_stats[3] = 0.0
        features += band_stats
    return features

segment_ids = np.unique(segments)
objects = []
object_ids = []
for id in segment_ids:
    segment_pixels = img[segments == id]
    object_features = segment_features(segment_pixels)
    objects.append(object_features)
    object_ids.append(id)
By replacing the for loop section with this:
def segment_features_calc(segment_pixels):
    segment_pixels = img[segments == id]
    object_features = segment_features(segment_pixels)
    objects.append(object_features)
    object_ids.append(id)
    print("segment "+str(id)+" features calculated")

n = mp.cpu_count()
if __name__ == '__main__':
    p = mp.Pool(processes=n)
    start = time.time()
    async_result = p.map_async(segment_features_calc, list(segment_ids))
    p.close()
    p.join()
    print("Complete")
    end = time.time()
    print('total time (s)= ' + str(end-start))
However, the multiprocessing run does not execute properly: it stops after 0.17 seconds, even though there are close to 270,000 segment IDs to loop over. Any insights on how to solve the issue?
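No fix is quoted here, but one plausible cause (my guess, not from the original thread): the worker receives a segment id yet treats it as pixels, and it appends to global lists that child processes cannot update in the parent. A sketch that looks the pixels up by id and returns the features instead (untested; img, segments and segment_features are the question's own objects, and a fork start method is assumed so the workers can see them):

import multiprocessing as mp
import numpy as np

def segment_features_calc(seg_id):
    # Look up the pixels for this segment and return the result to the parent
    # instead of appending to a global list (worker globals are not shared back).
    segment_pixels = img[segments == seg_id]
    return seg_id, segment_features(segment_pixels)

if __name__ == '__main__':
    segment_ids = np.unique(segments)
    with mp.Pool(processes=mp.cpu_count()) as p:
        # chunksize batches the ~270,000 small tasks.
        results = p.map(segment_features_calc, segment_ids, chunksize=1000)
    object_ids = [seg_id for seg_id, feats in results]
    objects = [feats for seg_id, feats in results]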

Multiprocessing - memory consumption

I've read all the related posts on the subject, but I can't for the life of me get multiprocessing to work properly with shared memory.
I'm using an EC2 instance with 96 cores, but for some reason despite using shared memory, my memory consumption explodes when using a worker pool with 96 workers.
EDIT: Had a bug earlier, which caused not all the cores to be used (stupid bug where I didn't give the right parameters for map) - anyways, clarified my current problem.
Any ideas? Attaching a screenshot of htop on my server to show the CPU usage + memory consumption.
For reference, I used the figtree package from here: https://github.com/ec2604/figtree (commit - 7ba197e45a5c6577fab56d469b4b1ccf02242e3d), it's a forked repository that ports C level code to python. Don't think it should really matter, you can plop any CPU intensive code in there instead.
EDIT: In hindsight, the figtree package allocates memory for the result, 5000099958 / (1024**3) ≈ 4.7 GB per process. Multiplied by 96 processes, this is what causes the insane memory consumption.
import figtree
import numpy as np
import multiprocessing
import ctypes
from multiprocessing import Pool, sharedctypes

n = 50000
m = 9995

X_base = sharedctypes.RawArray(ctypes.c_double, n * 77)
X_shared = np.frombuffer(X_base)
X_shared = X_shared.reshape(n, 77)
X_shared[:] = np.random.normal(0, 1, (n, 77))
del X_shared

Q_base = sharedctypes.RawArray(ctypes.c_double, m ** 2)
Q_shared = np.frombuffer(Q_base)
Q_shared = Q_shared.reshape(m, m)
Q_shared[:] = np.random.normal(0, 1, (m, m))
del Q_shared

def fig_helper_efficient(slice):
    Q_shared = np.frombuffer(Q_base)
    Q_shared = Q_shared.reshape(9995, 9995)
    X_shared = np.frombuffer(X_base)
    X_shared = X_shared.reshape(n, 77)
    print(id(Q_shared))
    if Q_shared.shape[0] == Q_shared.shape[1]:
        res = figtree.figtree(**{'X': X_shared[slice, :], 'Y': X_shared,
                                 'Q': Q_shared[:, slice].copy(), 'epsilon': 1e-12,
                                 'h': 15})
        print("done")
        return res

def divide_batches_equally(num_examples, num_batches):
    div_result = num_examples // num_batches
    mod_result = num_examples % num_batches
    size = np.zeros((num_batches + 1, 1)).astype(np.int32)
    size[1:] = div_result
    if mod_result > 0:
        size[1:mod_result + 1] += 1
    return np.cumsum(size)

def parallel_fig_vert_efficient():
    n_proc = 96
    size = divide_batches_equally(m, n_proc)
    parallel_list = [slice(int(size[i]), int(size[i + 1])) for i in range(n_proc)]
    with Pool(n_proc) as pool:
        res = pool.map(fig_helper_efficient, parallel_list)
    return res

if __name__ == '__main__':
    parallel_fig_vert_efficient()
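As the final edit notes, the shared arrays are not the problem here; the per-process result is. For reference, a minimal generic sketch of the RawArray pattern the question is aiming for (mine, independent of figtree, fork start method assumed): the parent fills a shared buffer once, workers re-wrap the same memory with np.frombuffer, and only small index chunks plus each worker's return value are copied.

import ctypes
import numpy as np
from multiprocessing import Pool, sharedctypes

N, D = 50000, 77
X_base = sharedctypes.RawArray(ctypes.c_double, N * D)

def col_mean(rows):
    # Re-wrap the shared buffer; nothing here copies the N x D array.
    X = np.frombuffer(X_base, dtype=np.float64).reshape(N, D)
    return X[rows].mean(axis=0)

if __name__ == '__main__':
    X = np.frombuffer(X_base, dtype=np.float64).reshape(N, D)
    X[:] = np.random.normal(0, 1, (N, D))
    chunks = np.array_split(np.arange(N), 8)
    with Pool(8) as pool:
        means = pool.map(col_mean, chunks)
    print(np.vstack(means).shape)  # (8, 77)

Per-process memory then grows only with what each worker returns, which in the figtree case is the large result singled out in the edit above.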
