Not getting a sorted list in Python

I have a list of user-defined process objects that I want to sort so I can see the most memory-intensive processes at any given time.
But reverse-sorting with sorted is not yielding the required result.
My code:
import psutil as pu
import time

class proc:
    def __init__(self, pid, pname, pmem):
        self.pid = pid
        self.pname = pname
        self.pmem = int(pmem)
    # def __lt__(self, other):
    #     return self.pmem < other.pmem
    # def __repr__(self):
    #     return str(self.pmem) + "\t" + self.pname

if __name__ == "__main__":
    meg = 1024*1024
    gig = meg*1024
    while True:
        print(pu.cpu_count())
        print(pu.cpu_percent())
        print("{:.3f} GB".format(pu.virtual_memory().used/gig))
        x = []
        for p in pu.pids():
            pro = pu.Process(pid=p)
            # print(pro.memory_info()[0])
            # print(pro.memory_info()[1])
            x.append(proc(pid=p, pname=pro.name(), pmem=pro.memory_info()[0]))
        sorted(x, key=lambda x: x.pmem, reverse=True)
        for i in x:
            print(str(i.pmem) + '\t' + i.pname)
        time.sleep(5)
Output:
http://pastebin.com/7Pz5Yn7A

You should use sort instead of sorted:
x.sort(key=lambda item: item.pmem, reverse=True)
sort sorts the existing list in place; sorted leaves the original untouched and returns a new sorted list, so the return value of your sorted(x, ...) call was silently discarded.
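Either fix works; for example:

# Option 1: sort in place, then iterate over x as before
x.sort(key=lambda item: item.pmem, reverse=True)

# Option 2: keep sorted(), but capture the new list it returns
x = sorted(x, key=lambda item: item.pmem, reverse=True)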

Related

Using multiprocessing to double the speed of working on a list

Let's say I have a list like this:
list_base = ['a','b','c','d']
With for xxx in list_base:, the loop processes the list one value at a time. To double the speed of this work, I am creating a list of pairs so that I can iterate over two values at once and hand each pair to multiprocessing.
Basic example
Code 1 (main_code.py):
import api_values

if __name__ == '__main__':
    list_base = ['a','b','c','d']
    api_values.main(list_base)
Code 2 (api_values.py):
import multiprocessing
import datetime

def add_hour(x):
    return str(x) + ' - ' + datetime.datetime.now().strftime('%d/%m/%Y %H:%M')

def main(list_base):
    a = list_base
    a_pairs = [a[i:i+2] for i in range(0, len(a)-1, 2)]
    if (len(a) % 2) != 0:
        a_pairs.append([a[-1]])
    final_list = []
    for a, b in a_pairs:
        mp_1 = multiprocessing.Process(target=add_hour, args=(a,))
        mp_2 = multiprocessing.Process(target=add_hour, args=(b,))
        mp_1.start()
        mp_2.start()
        mp_1.join()
        mp_2.join()
        final_list.append(mp_1)
        final_list.append(mp_2)
    print(final_list)
When I print final_list, it contains the Process objects themselves:
[
<Process name='Process-1' pid=9564 parent=19136 stopped exitcode=0>,
<Process name='Process-2' pid=5400 parent=19136 stopped exitcode=0>,
<Process name='Process-3' pid=13396 parent=19136 stopped exitcode=0>,
<Process name='Process-4' pid=5132 parent=19136 stopped exitcode=0>
]
I couldn't get at the return values produced by the add_hour(x) function.
I found some answers in this question:
How can I recover the return value of a function passed to multiprocessing.Process?
But I couldn't adapt them to my scenario, where I need the multiprocessing inside a function rather than inside if __name__ == '__main__':.
Whenever I try, I get errors related to where the code is placed, so I would like some help seeing how to apply this to my case.
Note: these are basic examples; my real use case is extracting data from an API that allows at most two simultaneous calls.
Additional code:
According to #Timus's comment (you might want to look into a Pool and .apply_async), I came to the code below. It seems to work, but I don't know whether it is reliable, whether any improvement is needed, or whether this option is the best; feel free to address that in an answer:
import multiprocessing
import datetime

final_list = []

def foo_pool(x):
    return str(x) + ' - ' + datetime.datetime.now().strftime('%d/%m/%Y %H:%M:%S')

def log_result(result):
    final_list.append(result)

def main(list_base):
    pool = multiprocessing.Pool()
    a = list_base
    a_pairs = [a[i:i+2] for i in range(0, len(a)-1, 2)]
    if (len(a) % 2) != 0:
        a_pairs.append([a[-1]])
    for a, b in a_pairs:
        pool.apply_async(foo_pool, args=(a,), callback=log_result)
        pool.apply_async(foo_pool, args=(b,), callback=log_result)
    pool.close()
    pool.join()
    print(final_list)
You don't have to use a callback: Pool.apply_async() returns an AsyncResult object, which has a .get() method for retrieving the result of the submitted call. An extension of your attempt:
import time
import multiprocessing
import datetime
from os import getpid

def foo_pool(x):
    print(getpid())
    time.sleep(2)
    return str(x) + ' - ' + datetime.datetime.now().strftime('%d/%m/%Y %H:%M:%S')

def main(list_base):
    a = list_base
    a_pairs = [a[i:i+2] for i in range(0, len(a)-1, 2)]
    if (len(a) % 2) != 0:
        a_pairs.append([a[-1]])
    final_list = []
    with multiprocessing.Pool(processes=2) as pool:
        for a, b in a_pairs:
            res_1 = pool.apply_async(foo_pool, args=(a,))
            res_2 = pool.apply_async(foo_pool, args=(b,))
            final_list.extend([res_1.get(), res_2.get()])
    print(final_list)

if __name__ == '__main__':
    list_base = ['a','b','c','d']
    start = time.perf_counter()
    main(list_base)
    end = time.perf_counter()
    print(end - start)
I have added print(getpid()) to foo_pool to show that you're actually using different processes, and I've used time to illustrate that, despite the time.sleep(2) in foo_pool, the overall duration of main is only about 2 seconds per pair, half of what processing the items sequentially would take.
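As a follow-up (a sketch, not part of the answer above): since the real constraint is at most two simultaneous API calls, Pool(processes=2) already enforces that limit on its own, so the manual pairing can be dropped entirely and plain pool.map used instead:

import multiprocessing
import datetime

def foo_pool(x):
    return str(x) + ' - ' + datetime.datetime.now().strftime('%d/%m/%Y %H:%M:%S')

def main(list_base):
    # the pool never runs more than two tasks at once, so the
    # two-simultaneous-calls limit is respected automatically
    with multiprocessing.Pool(processes=2) as pool:
        final_list = pool.map(foo_pool, list_base)
    print(final_list)

if __name__ == '__main__':
    main(['a', 'b', 'c', 'd'])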
I think you need shared strings between processes. They can be obtained from multiprocessing.Manager().
Your api_values.py should look like this:
import multiprocessing
import datetime
from ctypes import c_wchar_p

def add_hour(x, ret_str):
    ret_str.value = str(x) + ' - ' + datetime.datetime.now().strftime('%d/%m/%Y %H:%M')

def main(list_base):
    a = list_base
    a_pairs = [a[i:i+2] for i in range(0, len(a)-1, 2)]
    if (len(a) % 2) != 0:
        a_pairs.append([a[-1]])
    final_list = []
    manager = multiprocessing.Manager()
    for a, b in a_pairs:
        ret_str_a = manager.Value(c_wchar_p, "")
        ret_str_b = manager.Value(c_wchar_p, "")
        mp_1 = multiprocessing.Process(target=add_hour, args=(a, ret_str_a))
        mp_2 = multiprocessing.Process(target=add_hour, args=(b, ret_str_b))
        mp_1.start()
        mp_2.start()
        mp_1.join()
        mp_2.join()
        final_list.append(ret_str_a.value)
        final_list.append(ret_str_b.value)
    print(final_list)
Source: How to share a string amongst multiple processes using Managers() in Python?
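As a variation (a sketch, not part of the linked answer): a Manager().list() proxy can collect the results directly, which avoids creating one Value proxy per result:

import multiprocessing
import datetime

def add_hour(x, results):
    # append to the shared proxy list; the Manager relays it to the parent
    results.append(str(x) + ' - ' + datetime.datetime.now().strftime('%d/%m/%Y %H:%M'))

def main(list_base):
    manager = multiprocessing.Manager()
    results = manager.list()
    # keep the two-at-a-time structure from the question
    for i in range(0, len(list_base), 2):
        procs = [multiprocessing.Process(target=add_hour, args=(x, results))
                 for x in list_base[i:i + 2]]
        for p in procs:
            p.start()
        for p in procs:
            p.join()
    print(list(results))  # call main() from main_code.py as before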

ProcessPoolExecutor not working for function with multiple arguments

I have a dataframe df_full that I am trying to rewrite as a dict() while also doing some processing on it.
   agent    locations                        modal_choices
0  agent_1  'loc1', 'loc2', 'loc3', 'loc2'   'mode_1', 'mode_1', 'mode_2', 'mode_3'
1  agent_2  'loc1', 'loc4', 'loc2', 'loc6'   'mode_2', 'mode_3', 'mode_2', 'mode_3'
I am currently facing a problem while trying to multiprocess the function format_dict() below. I only want to iterate over the agent argument; the other three arguments are the same for every iteration, so I added partial() to "freeze" df, dict_ and list_. But the code returns an empty dict and an empty list at the end, and I don't understand why.
I suppose I haven't written the executor.map() call properly. I tried following the methods shown here, but it still doesn't return anything.
What could be wrong with my code?
I also timed the script with time.perf_counter() and compared that with what tqdm() reports, but the two values don't match: the iteration part finishes in 7 seconds according to tqdm, while the time.perf_counter() print only appears after 2.3 minutes.
What would explain the delay before the with concurrent.futures.ProcessPoolExecutor() as executor: block finishes?
I am, unfortunately, still not an expert in Python, and this is the first time I'm trying to multiprocess anything (the agent list I am working with is massive and would take days to process otherwise). Any help would be greatly appreciated! Please tell me if information is missing or something is not explained properly, and I'll edit the post right away.
import time
import concurrent.futures
from functools import partial

import pandas as pd
from tqdm import tqdm

def format_dict(agent, df, dict_, list_):
    try:
        dict_[agent] = dict()
        toto_ = df.loc[df.agent_ID == agent]
        toto_mod = toto_['modal_choices'].apply(lambda x: pd.Series(x.split(',')))
        toto_loc = toto_['locations'].apply(lambda x: pd.Series(x.split(',')))
        for i in toto_mod:
            dict_[agent]['step_{}'.format(i)] = dict()
            dict_[agent]['step_{}'.format(i)]['mode'] = toto_mod[i].iloc[0]
            # counter and dict_agent_edt come from surrounding code not shown here
            dict_[agent]['step_{}'.format(i)]['start'] = toto_loc[counter + 1].iloc[0]
            dict_[agent]['step_{}'.format(i)]['name'] = dict_agent_edt[agent]['step_0']['name']
    except ValueError:
        list_.append(agent)
    return dict_, list_

dict_name = dict()
list_name = list()
start = time.perf_counter()
agent = df_full['agent'][:1000]
with concurrent.futures.ProcessPoolExecutor() as executor:
    executor.map(partial(format_dict, df=df_full, dict_=dict_name, list_=list_name),
                 tqdm(agent), chunksize=50)
end = time.perf_counter()
print(f'It took {(end-start)/60} minutes.')
Following #Louis Lac's answer, I modified my script to avoid any concurrent mutation, but it still returns an empty dict.
def format_dict(agent, df, dict_):
    try:
        dict_[agent] = dict()
        toto_ = df.loc[df.agent_ID == agent]
        # (same stuff here)
    except ValueError:
        pass
    return dict_

start = time.perf_counter()
agents = df_full['agent'][:1000]
dict_name = {}
with concurrent.futures.ProcessPoolExecutor() as executor:
    executor.map(partial(format_dict, df=df_full, dict_=dict_name),
                 tqdm(agents), chunksize=50)
end = time.perf_counter()
print(f'It took {(end-start)/60} minutes.')
When using concurrency such as multithreading or multiprocessing, functions that are executed concurrently, such as format_dict, should not mutate shared state (to avoid data races), or the mutations should be synchronized.
You could, for instance, compute all your results concurrently first, then sequentially reduce them into the outputs (dict_ and list_):
def format_dict(agent, df):
    list_ = None
    try:
        dict_ = dict()
        toto_ = df.loc[df.agent_ID == agent]
        toto_mod = toto_['modal_choices'].apply(lambda x: pd.Series(x.split(',')))
        toto_loc = toto_['locations'].apply(lambda x: pd.Series(x.split(',')))
        for i in toto_mod:
            dict_['step_{}'.format(i)] = dict()
            dict_['step_{}'.format(i)]['mode'] = toto_mod[i].iloc[0]
            dict_['step_{}'.format(i)]['start'] = toto_loc[counter + 1].iloc[0]
            dict_['step_{}'.format(i)]['name'] = dict_agent_edt[agent]['step_0']['name']
    except ValueError:
        list_ = agent
    return dict_, list_

start = time.perf_counter()
agents = df_full['agent'][:1000]
with concurrent.futures.ProcessPoolExecutor() as executor:
    elements = executor.map(partial(format_dict, df=df_full),
                            tqdm(agents), chunksize=50)
    dict_ = {}
    list_ = []
    for agent, (d, l) in zip(agents, elements):
        if l is not None:
            list_.append(l)
        dict_[agent] = d
end = time.perf_counter()
print(f'It took {(end-start)/60} minutes.')
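A side note on why the original attempt returned an empty dict (a minimal sketch to illustrate, not from the answer above): ProcessPoolExecutor pickles each argument and sends it to a worker process, so the worker mutates its own copy of dict_, never the parent's. This also explains the tqdm/perf_counter mismatch: executor.map submits all tasks up front, which is when the tqdm-wrapped input is consumed, while the with block only exits once every worker has finished.

import concurrent.futures

def mutate(d):
    d['x'] = 1  # mutates the worker's pickled copy of the dict
    return d    # the change only travels back via the return value

if __name__ == '__main__':
    shared = {}
    with concurrent.futures.ProcessPoolExecutor() as executor:
        results = list(executor.map(mutate, [shared]))
    print(shared)   # {} - the parent's dict is unchanged
    print(results)  # [{'x': 1}] - the returned copy carries the mutation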

ThreadPool with list of classes and member function

I have a function that I would like to make multithreaded with a pool.
def find(item):
    curve = Curve(item)
    return curve._find()
The multithreaded version would check whether the input is a list:
def find(item):
    if type(item) == list:
        items = item
        pool = ThreadPool(len(items))
        curves = pool.map(Curve, moniker)
        pool.close()
        pool = ThreadPool(len(items))
        # now comes the tricky part:
        results = pool.map(???)  # curves is a list of class instances,
                                 # each having _find as a method
        pool.close()
        return results
    else:
        curve = Curve(item)
        return curve._find()
How can I call pool.map with a list of class instances as described above?
If I understood correctly, you just need to declare a helper function to map over the items of the list:
def find(item):
    def find_(item):
        curve = Curve(item)
        return curve._find()
    if type(item) == list:
        items = item
        pool = ThreadPool(len(items))
        # now comes the tricky part:
        results = pool.map(find_, items)  # builds a Curve per item
                                          # and calls _find on each
        pool.close()
        return results
    else:
        return find_(item)
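If the Curve objects have already been built, as in the question's first pool.map, a variation of the same idea (a sketch, assuming Curve behaves as in the question) is to map the unbound method over the instances:

from multiprocessing.pool import ThreadPool

def find(item):
    if isinstance(item, list):
        with ThreadPool(len(item)) as pool:
            curves = pool.map(Curve, item)        # one Curve per input item
            return pool.map(Curve._find, curves)  # Curve._find(c) == c._find()
    return Curve(item)._find()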

Python: List is replaced by the new list

I have a question regarding lists in Python. I use the append method to add values to my list, but instead of appending, it only replaces the list with the new values.
This is my code:
import csv

def init(serial):
    serial_number = serial
    api_call = "http://wwww.herecomesmyhyperlink/" + serial_number
    result = []
    with open('allserials.csv') as csvfile:
        reader = csv.reader(csvfile, delimiter=';', quotechar='|')
        for row in reader:
            if row[0].strip() == api_call:
                result.append(row[1].strip())
    call_api(serial_number, result)
    return

def call_api(get_serial, get_result):
    list_serial = []
    for i in range(len(get_result)):
        # do an api call
        ....
        # get result of api call
        list_serial.append(api_result)
    sort_serials(list_serial)
    return

def sort_serials(get_list_serial):
    sorted_list_serial = sorted(get_list_serial, reverse=True)
    print(sorted_list_serial)
    max_results = 10
    length_of_sorted_list_serial = len(get_list_serial)
    if length_of_sorted_list_serial < max_results:
        get_first_list_element = sorted_list_serial[0]
        get_second_element_of_that_list = get_first_list_element[1]
        init(get_second_element_of_that_list)
    else:
        print("it is not smaller")
    return

print(init('1320739'))
sorted_list_serial contains entries like [rankingid, serial, title].
get_second_element_of_that_list is the serial.
The thing is that when I run my code, I get the following results:
s: 1320739, max result:10 length of the list:3
s: 1523039, max result:10 length of the list:9
Instead of giving me a list of 12 items, the code replaces the 3-item list with the new 9-item list.
What I want is a single list of 12 items, where the first 3 items are still in the list and the 9 new elements are appended to the original list.
The list is scoped to the function call_api(), so pull it out, pass it along to each function, or create a class.
def init(serial):
    serial_number = serial
    result = []
    with open('allserials.csv') as csvfile:
        result.append()
    return result

def call_api(get_serial, get_result):
    # list_serial = []
    #
    # Move this out,
    # or pass it along to each function
    for i in range(len(get_result)):
        # do an api call
        ....
        # get result of api call
        list_serial.append(api_result)
    return list_serial

def sort_serials(get_list_serial):
    sorted_list_serial = sorted(get_list_serial, reverse=True)
    max_results = 10
    length_of_sorted_list_serial = len(get_list_serial)
    if length_of_sorted_list_serial < max_results:
        get_first_list_element = sorted_list_serial[0]
        get_second_element_of_that_list = get_first_list_element[1]
    else:
        print("it is not smaller")
        get_second_element_of_that_list = None
    return {'get_second_element_of_that_list': get_second_element_of_that_list,
            'sorted_serial_list': sorted_list_serial}
So scope it to one function, and have the other functions return their results:
def run():
    list_serial = []
    serial_number = '1320739'
    result = init(serial_number)
    # here the items get set
    list_serial = call_api(serial_number, result)
    # here they get sorted
    serial_sorted = sort_serials(list_serial)
    # list_serial is now the sorted list
    list_serial = serial_sorted['sorted_serial_list']
    get_second_element_of_that_list = serial_sorted['get_second_element_of_that_list']
    init(get_second_element_of_that_list)
Or redefine how it's passed:
serial_number = '1320739'
init(serial_number, list_serial)
call_api(serial_number, result, list_serial)
sort_serials(list_serial)
init(get_second_element_of_that_list, list_serial)
Or just pull it out to module level:
...
list_serial = []
print(init('1320739'))
Or create a class:
class SomeClassNameHere(object):
    def __init__(self, serialnumber=None, item2=''):
        self.serialnumber = serialnumber
        self.item3 = item2
        self.listserial = []
        self.run(item2)
    def doOtherStuff(self):
        # self.listserial will be updated
        self.listserial = [1, 2, 3]
        print(self.item3)
        print(self.serialnumber)
    def run(self, passeditem2):
        print('Item2 has been passed: {0}'.format(passeditem2))
        print('listserial not updated:', self.listserial)
        self.doOtherStuff()
        print('listserial updated:', self.listserial)

here = SomeClassNameHere(serialnumber='456', item2='somestring')
print(here.serialnumber)
print(here.item3)
here.run('somestring')
here.doOtherStuff()
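For completeness, here is a minimal runnable sketch of the "pass it along" option, with stand-in data instead of the CSV and API calls: every function receives the same list object and appends to it, so results from earlier calls survive later ones.

def init(serial, list_serial):
    # stand-in for reading the matching rows from allserials.csv
    result = [serial + '-a', serial + '-b', serial + '-c']
    call_api(serial, result, list_serial)

def call_api(get_serial, get_result, list_serial):
    for row in get_result:
        # stand-in for the API call
        list_serial.append(row)  # the shared list grows; it is never replaced

list_serial = []
init('1320739', list_serial)
init('1523039', list_serial)
print(len(list_serial))  # 6 - the first three items are still there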

Print progress of pool.map_async

I have the following functions:
import itertools
from multiprocessing import Pool

def do_comparison(tupl):
    x, y = tupl  # unpack arguments
    return compare_clusters(x, y)  # compare_clusters is defined elsewhere

def distance_matrix(clusters, condensed=False):
    pool = Pool()
    values = pool.map_async(do_comparison, itertools.combinations(clusters, 2)).get()
    # do stuff
Is it possible to print the progress of pool.map_async(do_comparison, itertools.combinations(clusters, 2)).get()?
I tried it by adding a count to do_comparison, like so:
count = 0
def do_comparison(tupl):
    global count
    count += 1
    if count % 1000 == 0:
        print(count)
    x, y = tupl  # unpack arguments
    return compare_clusters(x, y)
But aside from it not looking like a good solution, the numbers don't print until the end of the script. Is there a good way to do this?
I track progress as follows:
import multiprocessing
import time

class PoolProgress:
    def __init__(self, pool, update_interval=3):
        self.pool = pool
        self.update_interval = update_interval
    def track(self, job):
        # relies on private Pool internals (_cache, _number_left),
        # which may change between Python versions
        task = self.pool._cache[job._job]
        while task._number_left > 0:
            print("Tasks remaining = {0}".format(task._number_left * task._chunksize))
            time.sleep(self.update_interval)

def hi(x):  # This must be defined before `p` if we are to use it in the interpreter
    time.sleep(x//2)
    return x

a = list(range(50))
p = multiprocessing.Pool()
pp = PoolProgress(p)
res = p.map_async(hi, a)
pp.track(res)
The solution from Richard works well with a low number of jobs, but for some reason it seems to freeze at a very high number of jobs. I found it best to use:
import multiprocessing
import time

def track_job(job, update_interval=3):
    while job._number_left > 0:
        print("Tasks remaining = {0}".format(
            job._number_left * job._chunksize))
        time.sleep(update_interval)

def hi(x):  # This must be defined before `p` if we are to use it in the interpreter
    time.sleep(x//2)
    return x

a = [x for x in range(50)]
p = multiprocessing.Pool()
res = p.map_async(hi, a)
track_job(res)
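A variation that avoids Pool's private internals entirely (a sketch, assuming tqdm is installed, the same progress-bar library used in an earlier question on this page) is to wrap imap_unordered in tqdm, so the bar advances as each result arrives:

import multiprocessing
import time

from tqdm import tqdm  # third-party progress bar

def hi(x):
    time.sleep(x//2)
    return x

if __name__ == '__main__':
    a = list(range(50))
    with multiprocessing.Pool() as p:
        # imap_unordered yields results as they complete;
        # total= lets tqdm size the bar up front
        results = list(tqdm(p.imap_unordered(hi, a), total=len(a)))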