I don't know if this is a good way to optimize, but basically I am using python inside a 3D app to create random colors per object. And the code I have works well with objects within 10k polygons. But it crashes in 100k polygons. Is there a way to do it by chunks in the loop, basically I have the for loop and using an if statement to filter the first 100. But then I need another 100, and another 100, etc. How can I write that? Maybe with a time sleep between each. It's not going to be faster but at least won't possible crash the program. Thanks.
for i, n in enumerate(uvShellIds):
#code can only perform well within sets of 100 elements
limit = 100 #?
if 0 <= i <= 100:
#do something
print(n)
# now I need it to work on a new set of 100 elements
#if 101 <= i <= 200:
#(...keep going between sets of 100...)
My current code :
import maya.OpenMaya as om
import maya.cmds as cmds
import random
def getUvShelList(name):
selList = om.MSelectionList()
selList.add(name)
selListIter = om.MItSelectionList(selList, om.MFn.kMesh)
pathToShape = om.MDagPath()
selListIter.getDagPath(pathToShape)
meshNode = pathToShape.fullPathName()
uvSets = cmds.polyUVSet(meshNode, query=True, allUVSets =True)
allSets = []
for uvset in uvSets:
shapeFn = om.MFnMesh(pathToShape)
shells = om.MScriptUtil()
shells.createFromInt(0)
# shellsPtr = shells.asUintPtr()
nbUvShells = shells.asUintPtr()
uArray = om.MFloatArray() #array for U coords
vArray = om.MFloatArray() #array for V coords
uvShellIds = om.MIntArray() #The container for the uv shell Ids
shapeFn.getUVs(uArray, vArray)
shapeFn.getUvShellsIds(uvShellIds, nbUvShells, uvset)
# shellCount = shells.getUint(shellsPtr)
shells = {}
for i, n in enumerate(uvShellIds):
#print(i,n)
limit = 100
if i <= limit:
if n in shells:
# shells[n].append([uArray[i],vArray[i]])
shells[n].append( '%s.map[%i]' % ( name, i ) )
else:
# shells[n] = [[uArray[i],vArray[i]]]
shells[n] = [ '%s.map[%i]' % ( name, i ) ]
allSets.append({uvset: shells})
for shell in shells:
selection_shell = shells.get(shell)
cmds.select(selection_shell)
#print(shells.get(shell))
facesSel = cmds.polyListComponentConversion(fromUV=True, toFace=True)
cmds.select(facesSel)
r = [random.random() for i in range(3)]
cmds.polyColorPerVertex(facesSel,rgb=(r[0], r[1], r[2]), cdo=1 )
cmds.select(deselect=1)
getUvShelList( 'polySurface359' )
You can use islice from itertools to chunk.
from itertools import islice
uvShellIds = list(range(1000))
iterator = iter(uvShellIds)
while True:
chunk = list(islice(iterator, 100))
if not chunk:
break
print(chunk) # chunk contains 100 elements you can process
I don't know how well it fits in your current code but, below is how you can process the chunks:
from itertools import islice
uvShellIds = list(range(1000))
iterator = iter(uvShellIds)
offset = 0
while True:
chunk = list(islice(iterator, 100))
if not chunk:
break
# Processing chunk items
for i, n in enumerate(chunk):
# offset + i will give you the right index referring to the uvShellIds variable
# Then , perform your actions
if n in shells:
# shells[n].append([uArray[i],vArray[i]])
shells[n].append( '%s.map[%i]' % ( name, offset + i ) )
else:
# shells[n] = [[uArray[i],vArray[i]]]
shells[n] = [ '%s.map[%i]' % ( name, offset + i ) ]
offset += 100
# Your sleep can come here
The snippet above should replace your for i, n in enumerate(uvShellIds): block.
As #David Culbreth's answer stated, I'm not sure the sleep will be of help, but I left a comment on where you can place it.
I use this generator to "chunkify" my long-running operations in python into smaller batches:
def chunkify_list(items, chunk_size):
for i in range(0, len(items), chunk_size):
yield items[i:i+chunk_size]
With this defined, you can write your program something like this:
items = [1,2,3,4,5 ...]
for chunk in chunkify_list(items, 100):
for item in chunk:
process_item(item)
sleep(delay)
Now, I'm not going to guarantee that sleep will actually solve your problems, but this lets you handle your data one chunk at a time.
Related
I have a program that I created using threads, but then I learned that threads don't run concurrently in python and processes do. As a result, I am trying to rewrite the program using multiprocessing, but I am having a hard time doing so. I have tried following several examples that show how to create the processes and pools, but I don't think it's exactly what I want.
Below is my code with the attempts I have tried. The program tries to estimate the value of pi by randomly placing points on a graph that contains a circle. The program takes two command-line arguments: one is the number of threads/processes I want to create, and the other is the total number of points to try placing on the graph (N).
import math
import sys
from time import time
import concurrent.futures
import random
import multiprocessing as mp
def myThread(arg):
# Take care of imput argument
n = int(arg)
print("Thread received. n = ", n)
# main calculation loop
count = 0
for i in range (0, n):
x = random.uniform(0,1)
y = random.uniform(0,1)
d = math.sqrt(x * x + y * y)
if (d < 1):
count = count + 1
print("Thread found ", count, " points inside circle.")
return count;
# end myThread
# receive command line arguments
if (len(sys.argv) == 3):
N = sys.argv[1] # original ex: 0.01
N = int(N)
totalThreads = sys.argv[2]
totalThreads = int(totalThreads)
print("N = ", N)
print("totalThreads = ", totalThreads)
else:
print("Incorrect number of arguments!")
sys.exit(1)
if ((totalThreads == 1) or (totalThreads == 2) or (totalThreads == 4) or (totalThreads == 8)):
print()
else:
print("Invalid number of threads. Please use 1, 2, 4, or 8 threads.")
sys.exit(1)
# start experiment
t = int(time() * 1000) # begin run time
total = 0
# ATTEMPT 1
# processes = []
# for i in range(totalThreads):
# process = mp.Process(target=myThread, args=(N/totalThreads))
# processes.append(process)
# process.start()
# for process in processes:
# process.join()
# ATTEMPT 2
#pool = mp.Pool(mp.cpu_count())
#total = pool.map(myThread, [N/totalThreads])
# ATTEMPT 3
#for i in range(totalThreads):
#total = total + pool.map(myThread, [N/totalThreads])
# p = mp.Process(target=myThread, args=(N/totalThreads))
# p.start()
# ATTEMPT 4
# with concurrent.futures.ThreadPoolExecutor() as executor:
# for i in range(totalThreads):
# future = executor.submit(myThread, N/totalThreads) # start thread
# total = total + future.result() # get result
# analyze results
pi = 4 * total / N
print("pi estimate =", pi)
delta_time = int(time() * 1000) - t # calculate time required
print("Time =", delta_time, " milliseconds")
I thought that creating a loop from 0 to totalThreads that creates a process for each iteration would work. I also wanted to pass in N/totalThreads (to divide the work), but it seems that processes take in an iterable list rather than an argument to pass to the method.
What is it I am missing with multiprocessing? Is it at all possible to even do what I want to do with processes?
Thank you in advance for any help, it is greatly appreciated :)
I have simplified your code and used some hard-coded values which may or may not be reasonable.
import math
import concurrent.futures
import random
from datetime import datetime
def myThread(arg):
count = 0
for i in range(0, arg[0]):
x = random.uniform(0, 1)
y = random.uniform(0, 1)
d = math.sqrt(x * x + y * y)
if (d < 1):
count += 1
return count
N = 10_000
T = 8
_start = datetime.now()
with concurrent.futures.ThreadPoolExecutor() as executor:
futures = {executor.submit(myThread, (int(N / T),)): _ for _ in range(T)}
total = 0
for future in concurrent.futures.as_completed(futures):
total += future.result()
_end = datetime.now()
print(f'Estimate for PI = {4 * total / N}')
print(f'Run duration = {_end-_start}')
A typical output on my machine looks like this:-
Estimate for PI = 3.1472
Run duration = 0:00:00.008895
Bear in mind that the number of threads you start is effectively managed by the ThreadPoolExecutor (TPE) [ when constructed with no parameters ]. It makes decisions about the number of threads that can run based on your machine's processing capacity (number of cores etc). Therefore you could, if you really wanted to, set T to a very high number and the TPE will block execution of any new threads until it determines that there is capacity.
I was experimenting with binary search, and when I got my version working I figured I would compare its speed to that of NumPy's. I was fairly surprised at the results, for two reasons.
I know that binary search should grow as log n, which mine did, but NumPy grew linearly.
Not only that, but NumPy was just plain slower -- at the start and certainly at the end.
I attached a graph of the results. Orange is NumPy and blue is mine. To the left is the time in milliseconds it took to find the last item in the list (items[-1]) and the bottom shows the length of the list. I have also checked to make sure that my code is returning the correct value and it is.
In case I wasn't clear, my questions are basically "why" two #1 and #2
#binary_search.py
from typing import Iterable
from numba import njit
from numba.typed import List
def _find(items: Iterable[int], to_find: int):
min = -1
max = len(items)
while True:
split = int((max+min)/2)
item = items[split]
if item == to_find:
return split
elif max == min:
print(min, max)
print(items)
print(to_find)
print(split)
exit()
elif item > to_find:
max = split - 1
elif item < to_find:
min = split + 1
def findsorted(_items: Iterable[int], to_find: int):
items = _items
return _find(items, to_find)
#graph_results.py
import binary_search as bs
import sys
import time
import numpy as np
from matplotlib import pyplot as plt
iterations = int(sys.argv[1])
items = [0, 1]
lx = []
ly = []
nx = []
ny = []
for i in range(2, iterations):
l_avg_times = []
n_avg_times = []
items.append(items[-1] + 1)
for _ in range(0, 100):
to_find = items[-1]
lstart = time.time()
bs.findsorted(items, to_find)
lend = time.time()
nstart = lend
np.searchsorted(items, to_find)
nend = time.time()
ltotal = lend-lstart
ntotal = nend-nstart
l_avg_times.append(ltotal)
n_avg_times.append(ntotal)
ly.append(
round(
sum(l_avg_times)/len(l_avg_times),
1000
)*1000
)
lx.append(i)
ny.append(
round(
sum(n_avg_times)/len(n_avg_times),
1000
)*1000
)
nx.append(i)
plt.plot(lx, ly)
plt.plot(nx, ny)
plt.show()
I am attempting to process multiple files at once, wherein each file will generate chunks of data to feed to a queue of a certain size limit simultaneously.
For instance, if there are 5 files, containing 1 million elements each, I would like to feed 100 elements from each of them to another generator which yields 500 elements at a time.
Here is what I have been trying so far, but am running into the can't pickle generator error:
import os
from itertools import islice
import multiprocessing as mp
import numpy as np
class File(object):
def __init__(self, data_params):
data_len = 100000
self.large_data = np.array([data_params + str(i) for i in np.arange(0, data_len)])
def __iter__(self):
for i in self.large_data:
yield i
def parse_file(file_path):
# differnt filepaths yeild different data obviously
# here we just emulate with something silly
if file_path == 'elephant_file':
p = File(data_params = 'elephant')
if file_path == 'number_file':
p = File(data_params = 'number')
if file_path == 'horse_file':
p = File(data_params = 'horse')
yield from p
def parse_dir(user_given_dir, chunksize = 10):
pool = mp.Pool(4)
paths = ['elephant_file', 'number_file', 'horse_file'] #[os.path.join(user_given_dir, p) for p in os.listdir(user_given_dir)]
# Works, but not simultaneously on all paths
# for path in paths:
# data_gen = parse_file(path)
# parsed_data_batch = True
# while parsed_data_batch:
# parsed_data_batch = list(islice(data_gen, chunksize))
# yield parsed_data_batch
# Doesn't work
for objs in pool.imap(parse_file, paths, chunksize = chunksize):
for o in objs:
yield o
it = parse_dir('.')
for ix, o in enumerate(it):
print(o) # hopefully just prints 10 elephants, horses and numbers
if ix>2: break
Anyone have any idea of how to obtain the desired behavior?
For pickle error:
parse_file is a generator, not a regular function, since it uses yield inside.
And multiprocessing requires a function as task to execute. So you should replace yield from p with return p in parse_file()
If you want fetch records in chunks from all files one by one, try using zip in parse_dir():
iterators = [
iter(e) for e in pool.imap(parse_file, paths, chunksize=chunksize)
]
while True:
batch = [
o for i in iterators
for _, o in zip(range(100), i) # e.g., 100
]
if batch:
yield batch
else:
return
I want to use multiprocessing in Python to speed up a while loop.
More specifically:
I have a matrix (samples*features). I want to select x subsets of samples whose values at a random subset of features is unequal to a certain value (-1 in this case).
My serial code:
np.random.seed(43)
datafile = '...'
df = pd.read_csv(datafile, sep=" ", nrows = 89)
no_feat = 500
no_samp = 5
no_trees = 5
i=0
iter=0
samples = np.zeros((no_trees, no_samp))
features = np.zeros((no_trees, no_feat))
while i < no_trees:
rand_feat = np.random.choice(df.shape[1], no_feat, replace=False)
iter_order = np.random.choice(df.shape[0], df.shape[0], replace=False)
samp_idx = []
a=0
#--------------
#how to run in parallel?
for j in iter_order:
pot_samp = df.iloc[j, rand_feat]
if len(np.where(pot_samp==-1)[0]) == 0:
samp_idx.append(j)
if len(samp_idx) == no_samp:
print a
break
a+=1
#--------------
if len(samp_idx) == no_samp:
samples[i,:] = samp_idx
features[i, :] = rand_feat
i+=1
iter+=1
if iter>1000: #break if subsets cannot be found
break
Searching for fitting samples is the potentially expensive part (the j for loop), which in theory can be run in parallel. In some cases, it is not necessary to iterate over all samples to find a large enough subset, which is why I am breaking out of the loop as soon as the subset is large enough.
I am struggling to find an implementation that would allow for checks of how many valid results are generated already. Is it even possible?
I have used joblib before. If I understand correctly this uses the pool methods of multiprocessing as a backend which only works for separate tasks? I am thinking that queues might be helpful but thus far I failed at implementing them.
I found a working solution. I decided to run the while loop in parallel and have the different processes interact over a shared counter. Furthermore, I vectorized the search for suitable samples.
The vectorization yielded a ~300x speedup and running on 4 cores speeds up the computation ~twofold.
First I tried to implement separate processes and put the results into a queue. Turns out these aren't made to store large amounts of data.
If someone sees another bottleneck in that code I would be glad if someone pointed it out.
With my basically nonexistent knowledge about parallel computing I found it really hard to puzzle this together, especially since the example on the internet are all very basic. I learnt a lot though =)
My code:
import numpy as np
import pandas as pd
import itertools
from multiprocessing import Pool, Lock, Value
from datetime import datetime
import settings
val = Value('i', 0)
worker_ID = Value('i', 1)
lock = Lock()
def findSamp(no_trees, df, no_feat, no_samp):
lock.acquire()
print 'starting worker - {0}'.format(worker_ID.value)
worker_ID.value +=1
worker_ID_local = worker_ID.value
lock.release()
max_iter = 100000
samp = []
feat = []
iter_outer = 0
iter = 0
while val.value < no_trees and iter_outer<max_iter:
rand_feat = np.random.choice(df.shape[1], no_feat, replace=False
#get samples with random features from dataset;
#find and select samples that don't have missing values in the random features
samp_rand = df.iloc[:,rand_feat]
nan_idx = np.unique(np.where(samp_rand == -1)[0])
all_idx = np.arange(df.shape[0])
notnan_bool = np.invert(np.in1d(all_idx, nan_idx))
notnan_idx = np.where(notnan_bool == True)[0]
if notnan_idx.shape[0] >= no_samp:
#if enough samples for random feature subset, select no_samp samples randomly
notnan_idx_rand = np.random.choice(notnan_idx, no_samp, replace=False)
rand_feat_rand = rand_feat
lock.acquire()
val.value += 1
#x = val.value
lock.release()
#print 'no of trees generated: {0}'.format(x)
samp.append(notnan_idx_rand)
feat.append(rand_feat_rand)
else:
#increase iter_outer counter if no sample subset could be found for random feature subset
iter_outer += 1
iter+=1
if iter >= max_iter:
print 'exiting worker{0} because iter >= max_iter'.format(worker_ID_local)
else:
print 'worker{0} - finished'.format(worker_ID_local)
return samp, feat
def initialize(*args):
global val, worker_ID, lock
val, worker_ID, lock = args
def star_findSamp(i_df_no_feat_no_samp):
return findSamp(*i_df_no_feat_no_samp)
if __name__ == '__main__':
np.random.seed(43)
datafile = '...'
df = pd.read_csv(datafile, sep=" ", nrows = 89)
df = df.fillna(-1)
df = df.iloc[:, 6:]
no_feat = 700
no_samp = 10
no_trees = 5000
startTime = datetime.now()
print 'starting multiprocessing'
ncores = 4
p = Pool(ncores, initializer=initialize, initargs=(val, worker_ID, lock))
args = itertools.izip([no_trees]*ncores, itertools.repeat(df), itertools.repeat(no_feat), itertools.repeat(no_samp))
result = p.map(star_findSamp, args)#, callback=log_result)
p.close()
p.join()
print '{0} sample subsets for tree training have been found'.format(val.value)
samples = [x[0] for x in result if x != None]
samples = np.vstack(samples)
features = [x[1] for x in result if x != None]
features = np.vstack(features)
print datetime.now() - startTime
I have a huge list (45M+ data poitns), with numerical values:
[78,0,5,150,9000,5,......,25,9,78422...]
I can easily get the maximum and minimum values, the number of these values, and the sum of them:
file_handle=open('huge_data_file.txt','r')
sum_values=0
min_value=None
max_value=None
for i,line in enumerate(file_handle):
value=int(line[:-1])
if min_value==None or value<min_value:
min_value=value
if max_value==None or value>max_value:
max_value=value
sum_values+=value
average_value=float(sum_values)/i
However, this is not what I need. I need a list of 10 numbers, where the number of data points between each two consecutive points is equal, for example
median points [0,30,120,325,912,1570,2522,5002,7025,78422]
and we have the number of data points between 0 and 30 or between 30 and 120 to be almost 4.5 million data points.
How can we do this?
=============================
EDIT:
I am well aware that we will need to sort the data. The problem is that I cannot fit all this data in one variable in memory, but I need to read it sequentially from a generator (file_handle)
If you are happy with an approximation, here is a great (and fairly easy to implement) algorithm for computing quantiles from stream data: "Space-Efficient Online Computation of Quantile Summaries" by Greenwald and Khanna.
The silly numpy approach:
import numpy as np
# example data (produced by numpy but converted to a simple list)
datalist = list(np.random.randint(0, 10000000, 45000000))
# converted back to numpy array (start here with your data)
arr = np.array(datalist)
np.percentile(arr, 10), np.percentile(arr, 20), np.percentile(arr, 30)
# ref:
# http://docs.scipy.org/doc/numpy-dev/reference/generated/numpy.percentile.html
You can also hack something together where you just do like:
arr.sort()
# And then select the 10%, 20% etc value, add some check for equal amount of
# numbers within a bin and then calculate the average, excercise for reader :-)
The thing is that calling this function several times will slow it down, so really, just sort the array and then select the elements yourself.
As you said in the comments that you want a solution that can scale to larger datasets then can be stored in RAM, feed the data into an SQLlite3 database. Even if your data set is 10GB and you only have 8GB RAM a SQLlite3 database should still be able to sort the data and give it back to you in order.
The SQLlite3 database gives you a generator over your sorted data.
You might also want to look into going beyond python and take some other database solution.
Here's a pure-python implementation of the partitioned-on-disk sort. It's slow, ugly code, but it works and hopefully each stage is relatively clear (the merge stage is really ugly!).
#!/usr/bin/env python
import os
def get_next_int_from_file(f):
l = f.readline()
if not l:
return None
return int(l.strip())
MAX_SAMPLES_PER_PARTITION = 1000000
PARTITION_FILENAME = "_{}.txt"
# Partition data set
part_id = 0
eof = False
with open("data.txt", "r") as fin:
while not eof:
print "Creating partition {}".format(part_id)
with open(PARTITION_FILENAME.format(part_id), "w") as fout:
for _ in range(MAX_SAMPLES_PER_PARTITION):
line = fin.readline()
if not line:
eof = True
break
fout.write(line)
part_id += 1
num_partitions = part_id
# Sort each partition
for part_id in range(num_partitions):
print "Reading unsorted partition {}".format(part_id)
with open(PARTITION_FILENAME.format(part_id), "r") as fin:
samples = [int(line.strip()) for line in fin.readlines()]
print "Disk-Deleting unsorted {}".format(part_id)
os.remove(PARTITION_FILENAME.format(part_id))
print "In-memory sorting partition {}".format(part_id)
samples.sort()
print "Writing sorted partition {}".format(part_id)
with open(PARTITION_FILENAME.format(part_id), "w") as fout:
fout.writelines(["{}\n".format(sample) for sample in samples])
# Merge-sort the partitions
# NB This is a very inefficient implementation!
print "Merging sorted partitions"
part_files = []
part_next_int = []
num_lines_out = 0
# Setup data structures for the merge
for part_id in range(num_partitions):
fin = open(PARTITION_FILENAME.format(part_id), "r")
next_int = get_next_int_from_file(fin)
if next_int is None:
continue
part_files.append(fin)
part_next_int.append(next_int)
with open("data_sorted.txt", "w") as fout:
while part_files:
# Find the smallest number across all files
min_number = None
min_idx = None
for idx in range(len(part_files)):
if min_number is None or part_next_int[idx] < min_number:
min_number = part_next_int[idx]
min_idx = idx
# Now add that number, and move the relevent file along
fout.write("{}\n".format(min_number))
num_lines_out += 1
if num_lines_out % MAX_SAMPLES_PER_PARTITION == 0:
print "Merged samples: {}".format(num_lines_out)
next_int = get_next_int_from_file(part_files[min_idx])
if next_int is None:
# Remove this partition, it's now finished
del part_files[min_idx:min_idx + 1]
del part_next_int[min_idx:min_idx + 1]
else:
part_next_int[min_idx] = next_int
# Cleanup partition files
for part_id in range(num_partitions):
os.remove(PARTITION_FILENAME.format(part_id))
My code a proposal for finding the result without needing much space. In testing it found a quantile value in 7 minutes 51 seconds for a dataset of size 45 000 000.
from bisect import bisect_left
class data():
def __init__(self, values):
random.shuffle(values)
self.values = values
def __iter__(self):
for i in self.values:
yield i
def __len__(self):
return len(self.values)
def sortedValue(self, percentile):
val = list(self)
val.sort()
num = int(len(self)*percentile)
return val[num]
def init():
numbers = data([x for x in range(1,1000000)])
print(seekPercentile(numbers, 0.1))
print(numbers.sortedValue(0.1))
def seekPercentile(numbers, percentile):
lower, upper = minmax(numbers)
maximum = upper
approx = _approxPercentile(numbers, lower, upper, percentile)
return neighbor(approx, numbers, maximum)
def minmax(list):
minimum = float("inf")
maximum = float("-inf")
for num in list:
if num>maximum:
maximum = num
if num<minimum:
minimum = num
return minimum, maximum
def neighbor(approx, numbers, maximum):
dif = maximum
for num in numbers:
if abs(approx-num)<dif:
result = num
dif = abs(approx-num)
return result
def _approxPercentile(numbers, lower, upper, percentile):
middles = []
less = []
magicNumber = 10000
step = (upper - lower)/magicNumber
less = []
for i in range(1, magicNumber-1):
middles.append(lower + i * step)
less.append(0)
for num in numbers:
index = bisect_left(middles,num)
if index<len(less):
less[index]+= 1
summing = 0
for index, testVal in enumerate(middles):
summing += less[index]
if summing/len(numbers) < percentile:
print(" Change lower from "+str(lower)+" to "+ str(testVal))
lower = testVal
if summing/len(numbers) > percentile:
print(" Change upper from "+str(upper)+" to "+ str(testVal))
upper = testVal
break
precision = 0.01
if (lower+precision)>upper:
return lower
else:
return _approxPercentile(numbers, lower, upper, percentile)
init()
I edited my code a bit and I now think that this way works at least decently even when it's not optimal.