downloadStart = datetime.now()
while (True):
requestURL = transactionAPI.format(page = tempPage,limit = 5000)
response = requests.get(requestURL,headers=headers)
json_data = json.loads(response.content)
tempMomosTransactionHistory.extend(json_data["list"])
if(datetime.fromtimestamp(json_data["list"][-1]["crtime"]) < datetime(datetime.today().year,datetime.today().month,datetime.today().day - dateRange)):
break
tempPage += 1
downloadEnd = datetime.now()
Any suggestions please threading or something like that ?
Outputs here
downloadtime 0:00:02.056010
downloadtime 0:00:05.680806
downloadtime 0:00:05.447945
You need to improve it in two ways.
Optimise code within loop
Parallelize code execution
#1
By looking at your code I can see one improvement ie. create datetime.today object instead of doing 3 times. Check other methods like transactionAPI optimise further.
#2:
If you multi core CPU machine then you take advantage of machine by spanning thread per page. Refer to modified code of above.
import threading
def processRequest(tempPage):
requestURL = transactionAPI.format(page = tempPage,limit = 5000)
response = requests.get(requestURL,headers=headers)
json_data = json.loads(response.content)
tempMomosTransactionHistory.extend(json_data["list"])
downloadStart = datetime.now()
while (True):
#create thread per page
t1 = threading.Thread(target=processRequest, args=(tempPage, ))
t1.start()
#Fetch datetime today object once instaed 3 times
datetimetoday = datetime()
if(datetime.fromtimestamp(json_data["list"][-1]["crtime"]) < datetime(datetimetoday.year,datetimetoday.month,datetimetoday.day - dateRange)):
break
tempPage += 1
downloadEnd = datetime.now()
Given a list of data to process and a 64-core CPU (plus 500 GB RAM).
The list should sort strings and store data in a result set of millions of records, which runs just fine, takes a few seconds with multiprocessing.
But I'd also need to store the result somehow, either in a txt, csv output or a database. So far I haven't found a viable solution, because after the first part (process), the insert method either gives an error with trying it with MySQL pooling, or takes an insanely long time giving the txt output.
What Ive tried so far: simple txt output, print out to txt file, using csv, pandas and numpy libs. Nothing seems to speed it up. Any help would be greatly appreciated!
My code right now:
import os
import re
import datetime
import time
import csv
import mysql.connector as connector
from mysql.connector.pooling import MySQLConnectionPool
import mysql
import numpy as np
from tqdm import tqdm
from time import sleep
import multiprocessing as mp
import numpy
pool = MySQLConnectionPool( pool_name="sql_pool",
pool_size=32,
pool_reset_session=True,
host="localhost",
port="3306",
user="homestead",
password="secret",
database="homestead")
# # sql connection
db = mysql.connector.connect(
host="localhost",
port="3306",
user="homestead",
password="secret",
database="homestead"
)
sql_cursor = db.cursor()
delete_statement = "DELETE FROM statistics"
sql_cursor.execute(delete_statement)
db.commit()
sql_statement = "INSERT INTO statistics (name, cnt) VALUES (%s, %s)"
list = []
domains = mp.Manager().list()
unique_list = mp.Manager().list()
invalid_emails = mp.Manager().list()
result = mp.Manager().list()
regex_email = '^(\w|\.|\_|\-)+[#](\w|\_|\-|\.)+[.]\w{2,3}$'
# check email validity
def check(list, email):
if(re.search(regex_email, email)):
domains.append(email.lower().split('#')[1])
return True
else:
invalid_emails.append(email)
return False
#end of check email validity
# execution time converter
def convertTime(seconds):
seconds = seconds % (24 * 3600)
hour = seconds // 3600
seconds %= 3600
minutes = seconds // 60
seconds %= 60
if(hour == 0):
if(minutes == 0):
return "{0} sec".format(seconds)
else:
return "{0}min {1}sec".format(minutes, seconds)
else:
return "{0}hr {1}min {2}sec".format(hour, minutes, seconds)
# execution time converter end
#process
def process(list):
for item in tqdm(list):
if(check(list, item)):
item = item.lower().split('#')[1]
if item not in unique_list:
unique_list.append(item)
# end of process
def insert(list):
global sql_statement
# Add to db
con = pool.get_connection()
cur = con.cursor()
print("PID %d: using connection %s" % (os.getpid(), con))
#cur.executemany(sql_statement, sorted(map(set_result, list)))
for item in list:
cur.execute(sql_statement, (item, domains.count(item)))
con.commit()
cur.close()
con.close()
# def insert_into_database(list):
#sql_cursor.execute(sql_statement, (unique_list, 1), multi=True)
# sql_cursor.executemany(sql_statement, sorted(map(set_result, list)))
# db.commit()
# statistics
def statistics(list):
for item in tqdm(list):
if(domains.count(item) > 0):
result.append([domains.count(item), item])
# end of statistics
params = sys.argv
filename = ''
process_count = -1
for i, item in enumerate(params):
if(item.endswith('.txt')):
filename = item
if(item == '--top'):
process_count = int(params[i+1])
def set_result(item):
return item, domains.count(item)
# main
if(filename):
try:
start_time = time.time()
now = datetime.datetime.now()
dirname = "email_stats_{0}".format(now.strftime("%Y%m%d_%H%M%S"))
os.mkdir(dirname)
list = open(filename).read().split()
if(process_count == -1):
process_count = len(list)
if(process_count > 0):
list = list[:process_count]
#chunking list
n = int(len(list) / mp.cpu_count())
chunks = [list[i:i + n] for i in range(0, len(list), n)]
processes = []
print('Processing list on {0} cores...'.format(mp.cpu_count()))
for chunk in chunks:
p = mp.Process(target=process, args=[chunk])
p.start()
processes.append(p)
for p in processes:
p.join()
# insert(unique_list)
## step 2 - write sql
## Clearing out db before new data insert
con = pool.get_connection()
cur = con.cursor()
delete_statement = "DELETE FROM statistics"
cur.execute(delete_statement)
u_processes = []
#Maximum pool size for sql is 32, so maximum chunk number should be that too.
if(mp.cpu_count() < 32):
n2 = int(len(unique_list) / mp.cpu_count())
else:
n2 = int(len(unique_list) / 32)
u_chunks = [unique_list[i:i + n2] for i in range(0, len(unique_list), n2)]
for u_chunk in u_chunks:
p = mp.Process(target=insert, args=[u_chunk])
p.start()
u_processes.append(p)
for p in u_processes:
p.join()
for p in u_processes:
p.close()
# sql_cursor.executemany(sql_statement, sorted(map(set_result, unique_list)))
# db.commit()
# for item in tqdm(unique_list):
# sql_val = (item, domains.count(item))
# sql_cursor.execute(sql_statement, sql_val)
#
# db.commit()
## numpy.savetxt('saved.txt', sorted(map(set_result, unique_list)), fmt='%s')
# with(mp.Pool(mp.cpu_count(), initializer = db) as Pool:
# Pool.map_async(insert_into_database(),set(unique_list))
# Pool.close()
# Pool.join()
print('Creating statistics for {0} individual domains...'.format(len(unique_list)))
# unique_list = set(unique_list)
# with open("{0}/result.txt".format(dirname), "w+") as f:
# csv.writer(f).writerows(sorted(map(set_result, unique_list), reverse=True))
print('Writing final statistics...')
print('OK.')
f = open("{0}/stat.txt".format(dirname),"w+")
f.write("Number of processed emails: {0}\r\n".format(process_count))
f.write("Number of valid emails: {0}\r\n".format(len(list) - len(invalid_emails)))
f.write("Number of invalid emails: {0}\r\n".format(len(invalid_emails)))
f.write("Execution time: {0}".format(convertTime(int(time.time() - start_time))))
f.close()
except FileNotFoundError:
print('File not found, path or file broken.')
else:
print('Wrong file format, should be a txt file.')
# main
See my comments regarding some changes you might wish to make, one of which might improve performance. But I think one area of performance which could really be improved is in your use of managed lists. These are represented by proxies and each operation on such a list is essentially a remote procedure call and thus very slow. You cannot avoid this given that you need to have multiple processes updating a common, shared lists (or dict if you take my suggestion). But in the main process you might be trying, for example, to construct a set from a shared list as follows:
Pool.map_async(insert_into_database(),set(unique_list))
(by the way, that should be Pool.map(insert_into_database, set(unique_list)), i.e. you have an extra set of () and you can then get rid of the calls to pool.close() and pool.join() if you wish)
The problem is that you are iterating every element of unique_list through a proxy, which might be what is taking a very long time. I say "might" because I would think the use of managed lists would prevent the code as is, i.e. without outputting the results, from completing in "a few seconds" if we are talking about "millions" of records and thus millions of remote procedure calls. But this number could certainly be reduced if you could somehow get the underlying list as a native list.
First, you need to heed my comment about having declared a variable named list thus making it impossible to create native lists or subclasses of list. Once your have renamed that variable to something more reasonable, we can create our own managed class MyList that will expose the underlying list on which it is built. Note that you can do the same thing with a MyDict class that subclasses dict. I have defined both classes for you. Here is a benchmark showing the difference between constructing a native list from a managed list versus creating a native list from a MyList:
import multiprocessing as mp
from multiprocessing.managers import BaseManager
import time
class MyManager(BaseManager):
pass
class MyList(list):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
def get_underlying_list(self):
return self
class MyDict(dict):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
def get_underlying_dict(self):
return self
# required for windows, which I am running on:
if __name__ == '__main__':
l = mp.Manager().list()
for i in range(100_000):
l.append(i)
t = time.time()
l2 = list(l)
print(time.time() - t, l2[0:5], l2[-5:])
MyManager.register('MyList', MyList)
MyManager.register('MyDict', MyDict)
my_manager = MyManager()
# must explicitly start the manager or use: with MyManager() as manager:
my_manager.start()
l = my_manager.MyList()
for i in range(100_000):
l.append(i)
t = time.time()
l2 = list(l.get_underlying_list())
print(time.time() - t, l2[0:5], l2[-5:])
Prints:
7.3949973583221436 [0, 1, 2, 3, 4] [99995, 99996, 99997, 99998, 99999]
0.007997751235961914 [0, 1, 2, 3, 4] [99995, 99996, 99997, 99998, 99999]
I'm with a problem using multiprocess with Python. I have two codes. The first is working well, but it's outside of a class and I need to put it inside of a class because this class is part of a bigger program. When I do it the code takes 250 seconds to run instead of 10 when is
The working code (without class) is:
import time
nlp =spacy.load("en_core_web_md")
start_time = time.time()
doc1 = nlp(str("Data Scientist"))
def get_paralel_similarity(item):
return doc1.similarity(nlp(item))
if __name__ == '__main__':
pool = Pool() # Create a multiprocessing Pool
similarities = pool.map(get_paralel_similarity, list(df["jobs"]))
print("--- %s seconds ---" % (time.time() - start_time))
--- 10.971235990524292 seconds ---
You can see that it took less than 11 seconds to run. Without multiprocessing, the same process was taking 1 minute.
The problem is, doc1 is dynamic and I need to run this code a large number of times. In that sense I need to put it in a class. The code I did with this objective is:
import time
import spacy
import warnings
import operator
from multiprocessing import Pool, set_start_method
from functools import partial
from tqdm import tqdm
warnings.filterwarnings("ignore")
nlp =spacy.load("en_core_web_md")
def get_paralel_similarity(tup):
#doc1 = tup[0]
return tup[0].similarity(nlp(tup[1]))
class Matcher(object):
def __init__(self,**kwargs):
self.word = kwargs.get('word')
self.word_list = kwargs.get('word_list')
self.n = kwargs.get('n')
self.nlp = kwargs.get('nlp')
def get_top_similarities(self):
pool = Pool()
similarities = {}
doc1 = nlp(str(self.word))
tup_list = []
for i in tqdm(self.word_list):
tup_list.append((doc1,i))
start_time = time.time()
similarities = pool.map(get_paralel_similarity, tup_list)
pool.close()
#pool.join()
print("--- %s seconds ---" % (time.time() - start_time))
simi = {}
for i in tqd(range(len(self.word_list))):
simi[self.word_list[i]] = similarities[i]
return sorted(simi.items(),key=operator.itemgetter(1),reverse=True)[:self.n]
When I do:
import pandas as pd
df = pd.read_pickle("complete.pkl")
matcher = Matcher(word="Data Scientist",word_list=list(df["jobs"]),n=5,nlp=nlp)
similarity = matcher.get_top_similarities()
--- 256.1134169101715 seconds ---
It's taking ~250 seconds. I will appreciate it if you please help me understand what is wrong?
I use a script to parce some sites and get news from there.
Each function in this script parse one site and return list of articles and then I want to combine them all in one big list.
If I parce site by site it takes to long and I desided to use multithreading.
I found a sample like this one in the bottom, but it seems not pithonic for me.
If I will add one more function to parse one more site, I will need to add each time the same block of code:
qN = Queue()
Thread(target=wrapper, args=(last_news_from_bar, qN)).start()
news_from_N = qN.get()
for new in news_from_N:
news.append(new)
Is there another solution to do this kind of stuff?
#!/usr/bin/python
# -*- coding: utf-8 -*-
from queue import Queue
from threading import Thread
def wrapper(func, queue):
queue.put(func())
def last_news_from_bar():
...
return list_of_articles #[['title1', 'http://someurl1', '2017-09-13'],['title2', 'http://someurl2', '2017-09-13']]
def last_news_from_foo():
...
return list_of_articles
q1, q2 = Queue(), Queue()
Thread(target=wrapper, args=(last_news_from_bar, q1)).start()
Thread(target=wrapper, args=(last_news_from_foo, q2)).start()
news_from_bar = q1.get()
news_from_foo = q2.get()
all_news = []
for new in news_from_bar:
news.append(new)
for new in news_from_foo:
news.append(new)
print(all_news)
Solution without Queue:
NEWS = []
LOCK = Lock()
def gather_news(url):
while True:
news = news_from(url)
if not news: break
with LOCK:
NEWS.append(news)
if __name__ == '__main__':
T = []
for url in ['url1', 'url2', 'url3']:
t = Thread(target=gather_news, args=(url,))
t.start()
T.append(t)
# Wait until all Threads done
for t in T:
t.join()
print(NEWS)
All, that you should do, is using a single queue and extend your result array:
q1 = Queue()
Thread(target=wrapper, args=(last_news_from_bar, q1)).start()
Thread(target=wrapper, args=(last_news_from_foo, q1)).start()
all_news = []
all_news.extend(q1.get())
all_news.extend(q1.get())
print(all_news)
I am trying to create a thread function that allow me to check a database field in order to see if the time.now() is bigger than the one recorded in the database(postgresql); the problem is that the view.py where I am calling this, is blocked by this thread, this is my actual code:
PD: expire_pet is a text field, then I cast it to datetime.
import socket
import struct
from time import *
from datetime import datetime
from models import Zone
from multiprocessing import pool
import threading
class ControlHora(threading.Thread):
def __init__(self,zone_id):
threading.Thread.__init__(self)
self.zone_id = zone_id
def run(self):
while(True):
zone_pet = Zone.objects.filter(id = self.zone_id)
for i in zone_pet:
if i.pet_state == True:
hour = datetime.datetime.strptime(i.expire_pet, '%I:%M')
if hour <= datetime.datetime.now():
Zone.objects.filter(id = self.zone_id).update(vitrasa_pet = False)
Zone.objects.filter(id = self.zone_id).update(esycsa_pet = False)
Zone.objects.filter(id = self.zone_id).update(pet_state = False)
Zone.objects.filter(id = self.zone_id).update(expire_pet='')
sleep(5)
It works, the problem was that I have been calling the run in the wrong place, thanks