I have been trying to find a simple example of sharing one constant variable per process launched in my process pool. Most examples show how to share variables across processes, which is not what I want.
import multiprocessing
import time

data = (
    {"var": 1, "shared": None},
    {"var": 2, "shared": None},
    {"var": 3, "shared": None},
    {"var": 4, "shared": None},
)

def mp_worker(input):
    print input
    # print " Process %s\tWaiting %s seconds" % (inputs, the_time)
    # time.sleep(int(the_time))
    # print " Process %s\tDONE" % inputs

def mp_handler():
    p = multiprocessing.Pool(2)
    p.map(mp_worker, data)

if __name__ == '__main__':
    mp_handler()
For example, if I run this code, I would like to have my "shared" component initialized once for each process.
I would like to do something like this (this doesn't work):
from multiprocessing import Pool, Process

class Worker(Process):
    def __init__(self):
        print 'Worker started'
        # do some initialization here
        super(Worker, self).__init__()

    def compute(self, data):
        print 'Computing things!'
        return data * data

if __name__ == '__main__':
    # This works fine
    worker = Worker()
    #print worker.compute(3)

    # workers get initialized fine
    pool = Pool(processes=4,
                initializer=Worker)
    data = range(10)
    # How to use my worker pool?
    result = pool.map(Worker.compute, data)
Using shared ctypes:
import multiprocessing
from multiprocessing import Process, Lock
from multiprocessing.sharedctypes import Value
from ctypes import Structure, c_double

class Point(Structure):
    _fields_ = [('x', c_double), ('y', c_double)]

def modify(parmMap):
    parmMap['shared'].x = parmMap['var']
    parmMap['shared'].y = parmMap['var'] * 2

if __name__ == '__main__':
    lock = Lock()
    data = ( {'var': 1, 'shared': Value(Point, (0, 0), lock=lock)},
             {'var': 2, 'shared': Value(Point, (0, 0), lock=lock)},
             {'var': 3, 'shared': Value(Point, (0, 0), lock=lock)},
             {'var': 4, 'shared': Value(Point, (0, 0), lock=lock)}
           )

    p = multiprocessing.Pool(2)
    print p.map(modify, data)
    print data
import multiprocessing

# TestModuleShared, params, per_gpu_threads and num_gpu come from the surrounding
# application code.

def init(args, num_gpu):
    # Derive a zero-based worker index from the pool process name, e.g. "ForkPoolWorker-3"
    pid = int(str(multiprocessing.current_process()).split(" ")[0].split("-")[-1].split(",")[0]) - 1
    gpu_id = pid % num_gpu
    global testModule
    testModule = TestModuleShared(args, gpu_id)

def worker(datum):
    pid = int(str(multiprocessing.current_process()).split(" ")[0].split("-")[-1].split(",")[0]) - 1
    params = datum["params"]

    # print str(datum["fc"]) + " " + str(pid)
    # print testModule.openpose

    # Reset State
    testModule.run()

p = multiprocessing.Pool(per_gpu_threads * num_gpu, initializer=init, initargs=(params["test_module_param"], num_gpu,))
It turns out you can just use the global keyword, together with an initializer callback, to set up such an object once per worker process.
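For reference, here is a minimal Python 3 sketch of that pattern applied to an example like the one above; it is an illustration rather than code from the original post, and the names per_process_state and init_worker are made up for it:

import multiprocessing

# Each worker process gets its own copy of this global.
per_process_state = None

def init_worker(constant):
    # Runs exactly once in every worker process when the pool starts.
    global per_process_state
    per_process_state = {"constant": constant}

def mp_worker(item):
    # Every task executed in this process sees the object built in init_worker.
    return item["var"] * per_process_state["constant"]

if __name__ == "__main__":
    data = [{"var": n} for n in range(1, 5)]
    with multiprocessing.Pool(2, initializer=init_worker, initargs=(10,)) as pool:
        print(pool.map(mp_worker, data))  # [10, 20, 30, 40]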
Goal:
Accelerate the random walk generation by using multiple processes.
Get the list of vertex ids from which I want random walks to be generated into an input queue
Start as many processes as possible with the correct parameters
Make them put the random walks into an output queue
Wait for completion
Read the output queue
What I am doing:
# Libraries imports
from multiprocessing import cpu_count, Process, Queue
import queue
import configparser
from gremlin_python.driver.driver_remote_connection import DriverRemoteConnection
from gremlin_python.process.anonymous_traversal import AnonymousTraversalSource, traversal
from gremlin_python.process.graph_traversal import __
# Function the processes are supposed to execute
def job(proc_id:int, siq:Queue, rwq:Queue, g:AnonymousTraversalSource, length:int):
while True:
try:
# Get next element in ids queue
start_id = siq.get_nowait()
except queue.Empty:
# If the ids queue is empty, then terminate
break
else:
# Do a random walk of length <length> from the vertex with id <start_id>
random_walk = g.V(start_id).repeat(
__.local(__.both().sample(1))
).times(length).path().next()
print(f"{proc_id}: rw obtained")
# Transform the list of vertices into a comma-separated string of ids
rwq.put(",".join(
[str(v.id) for v in random_walk]
))
print(f"{proc_id}: rw handled")
if __name__ == "__main__":
# Get the parameters from the <config.ini> configuration file
config = configparser.RawConfigParser()
config.read("config.ini")
jg_uri = config["JANUSGRAPH"]["URI"]
file_random_walks = config["FILES"]["RANDOM_WALKS"]
walks_nb_per_node = int(config["WALKS"]["NB_PER_NODE"])
walks_length = int(config["WALKS"]["LENGTH"])
# Connect to Janus Graph
connection = DriverRemoteConnection(jg_uri, "g")
g_main = traversal().withRemote(connection)
# Instantiate the queues and populate the ids one
start_ids_queue = Queue()
random_walks_queue = Queue()
for vertex in g_main.V().has("vertex_label", "<label>").fold().next():
start_ids_queue.put(vertex.id)
# Create and start the processes
nb_processes = cpu_count()
processes = []
for i in range(nb_processes):
p = Process(target=job, args=(
i,
start_ids_queue,
random_walks_queue,
g_main,
walks_length
))
processes.append(p)
p.start()
for p in processes:
p.join()
# Once the processes are terminated, read the random walks queue
random_walks = []
while not random_walks_queue.empty():
random_walks.append(random_walks_queue.get())
# Do something with the random walks
...
Issue:
Once the processes are started, nothing seems to happen. I never get the X: rw obtained / X: rw handled messages. With a bit more logging, I can see that the queries are sent, yet they never finish.
In the logs, when performing the first g_main.V().has("vertex_label", "<label>").fold().next() in the main process (when I populate the ids queue), I have the following message:
DEBUG:gremlinpython:submit with bytecode '[['V'], ['has', 'vertex_label', 'movie'], ['fold']]'
DEBUG:gremlinpython:message '[['V'], ['has', 'vertex_label', '<label>'], ['fold']]'
DEBUG:gremlinpython:processor='traversal', op='bytecode', args='{'gremlin': [['V'], ['has', 'vertex_label', '<label>'], ['fold']], 'aliases': {'g': 'g'}}'
DEBUG:asyncio:Using selector: EpollSelector
When the other processes send their queries, I have similar logs:
DEBUG:gremlinpython:submit with bytecode '[['V', 16456], ['repeat', [['local', [['both'], ['sample', 1]]]]], ['times', 10], ['path']]'
DEBUG:gremlinpython:message '[['V', 16456], ['repeat', [['local', [['both'], ['sample', 1]]]]], ['times', 10], ['path']]'
DEBUG:gremlinpython:processor='traversal', op='bytecode', args='{'gremlin': [['V', 16456], ['repeat', [['local', [['both'], ['sample', 1]]]]], ['times', 10], ['path']], 'aliases': {'g': 'g'}}'
DEBUG:asyncio:Using selector: EpollSelector
The issue does not seem to lie in the queries being sent, but in the indefinite wait that follows.
If you know of an issue between gremlinpython and multiprocessing, if there is a problem in my multiprocessing code, or if you have any explanation that I may have overlooked, please let me know! Thanks a lot to everyone reading this!
Solutions:
The first partial solution that I found is to use multi-threading instead of multiprocessing:
import configparser
from gremlin_python.driver.driver_remote_connection import DriverRemoteConnection
from gremlin_python.process.anonymous_traversal import AnonymousTraversalSource, traversal
from gremlin_python.process.graph_traversal import __
import threading
import logging
class myThread(threading.Thread):
def __init__(self, thread_id, g, length):
threading.Thread.__init__(self)
self.thread_id = thread_id
self.thread_count = 0
self.gtraversal = g
self.walk_length = length
def run(self):
while True:
start_ids_list_lock.acquire()
try:
start_id = start_ids_list.pop(0)
start_ids_list_lock.release()
except IndexError:
start_ids_list_lock.release()
break
else:
self.thread_count += 1
random_walk = job(
vertex_id=start_id,
g=self.gtraversal,
length=self.walk_length
)
random_walks_list_lock.acquire()
random_walks_list.append(random_walk)
random_walks_list_lock.release()
logging.info(f"Thread {self.thread_id}: {self.thread_count} done")
def job(vertex_id:int, g:AnonymousTraversalSource, length:int) -> str:
random_walk = g.V(vertex_id).repeat(
__.local(__.both().sample(1))
).times(length).path().next()
return ",".join(str(v.id) for v in random_walk)
config = configparser.RawConfigParser()
config.read("config.ini")
jg_uri = config["JANUSGRAPH"]["URI"]
file_random_walks = config["FILES"]["RANDOM_WALKS"]
walks_length = int(config["WALKS"]["LENGTH"])
connection = DriverRemoteConnection(jg_uri, "g")
g_main = traversal().withRemote(connection)
threads = []
start_ids_list = []
random_walks_list = []
random_walks_list_lock = threading.Lock()
start_ids_list_lock = threading.Lock()
start_ids_list = [vertex.id for vertex in g_main.V().has("vertex_label", "<label>").fold().next()]
nb_vertices = len(start_ids_list)
nb_threads = 6
for i in range(nb_threads):
thread = myThread(
thread_id=i,
g=g_main,
length=walks_length
)
thread.start()
threads.append(thread)
for t in threads:
t.join()
# Do something with the random walks
...
This solution works and improves the execution time of the program. It isn't a full answer though, as it doesn't explain why multiprocessing did not behave as I expected.
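As for the why: one plausible explanation (not verified against your setup, so treat it as a hypothesis) is that g_main is bound to a DriverRemoteConnection whose websocket and asyncio event loop were created in the parent process; once that connection object is inherited by the forked workers, requests still go out but the responses are never delivered back to a usable event loop, hence the indefinite wait. A sketch of the multiprocessing worker with one connection per process, everything else unchanged from the original code (the only change is passing jg_uri instead of g_main), would look like this:

from multiprocessing import Process, Queue
import queue

from gremlin_python.driver.driver_remote_connection import DriverRemoteConnection
from gremlin_python.process.anonymous_traversal import traversal
from gremlin_python.process.graph_traversal import __

def job(proc_id: int, siq: Queue, rwq: Queue, jg_uri: str, length: int):
    # Each worker opens its own connection after the fork instead of
    # inheriting g_main from the parent process.
    connection = DriverRemoteConnection(jg_uri, "g")
    g = traversal().withRemote(connection)
    try:
        while True:
            try:
                start_id = siq.get_nowait()
            except queue.Empty:
                break
            random_walk = g.V(start_id).repeat(
                __.local(__.both().sample(1))
            ).times(length).path().next()
            rwq.put(",".join(str(v.id) for v in random_walk))
    finally:
        connection.close()

# In the main block, pass the URI instead of the traversal source:
# p = Process(target=job, args=(i, start_ids_queue, random_walks_queue, jg_uri, walks_length))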
I have a list of data to process and a 64-core CPU (plus 500 GB of RAM).
The job sorts strings and stores the data in a result set of millions of records; that part runs just fine and takes only a few seconds with multiprocessing.
But I also need to store the result somewhere, either as txt or csv output or in a database. So far I haven't found a viable solution, because after the first part (process), the insert step either throws an error when I try it with MySQL pooling, or takes an insanely long time when writing the txt output.
What I've tried so far: simple txt output, printing to a txt file, and the csv, pandas and numpy libraries. Nothing seems to speed it up. Any help would be greatly appreciated!
My code right now:
import os
import sys
import re
import datetime
import time
import csv
import mysql.connector as connector
from mysql.connector.pooling import MySQLConnectionPool
import mysql
import numpy as np
from tqdm import tqdm
from time import sleep
import multiprocessing as mp
import numpy
pool = MySQLConnectionPool( pool_name="sql_pool",
pool_size=32,
pool_reset_session=True,
host="localhost",
port="3306",
user="homestead",
password="secret",
database="homestead")
# # sql connection
db = mysql.connector.connect(
host="localhost",
port="3306",
user="homestead",
password="secret",
database="homestead"
)
sql_cursor = db.cursor()
delete_statement = "DELETE FROM statistics"
sql_cursor.execute(delete_statement)
db.commit()
sql_statement = "INSERT INTO statistics (name, cnt) VALUES (%s, %s)"
list = []
domains = mp.Manager().list()
unique_list = mp.Manager().list()
invalid_emails = mp.Manager().list()
result = mp.Manager().list()
regex_email = '^(\w|\.|\_|\-)+[#](\w|\_|\-|\.)+[.]\w{2,3}$'
# check email validity
def check(list, email):
if(re.search(regex_email, email)):
domains.append(email.lower().split('#')[1])
return True
else:
invalid_emails.append(email)
return False
#end of check email validity
# execution time converter
def convertTime(seconds):
seconds = seconds % (24 * 3600)
hour = seconds // 3600
seconds %= 3600
minutes = seconds // 60
seconds %= 60
if(hour == 0):
if(minutes == 0):
return "{0} sec".format(seconds)
else:
return "{0}min {1}sec".format(minutes, seconds)
else:
return "{0}hr {1}min {2}sec".format(hour, minutes, seconds)
# execution time converter end
#process
def process(list):
for item in tqdm(list):
if(check(list, item)):
item = item.lower().split('#')[1]
if item not in unique_list:
unique_list.append(item)
# end of process
def insert(list):
global sql_statement
# Add to db
con = pool.get_connection()
cur = con.cursor()
print("PID %d: using connection %s" % (os.getpid(), con))
#cur.executemany(sql_statement, sorted(map(set_result, list)))
for item in list:
cur.execute(sql_statement, (item, domains.count(item)))
con.commit()
cur.close()
con.close()
# def insert_into_database(list):
#sql_cursor.execute(sql_statement, (unique_list, 1), multi=True)
# sql_cursor.executemany(sql_statement, sorted(map(set_result, list)))
# db.commit()
# statistics
def statistics(list):
for item in tqdm(list):
if(domains.count(item) > 0):
result.append([domains.count(item), item])
# end of statistics
params = sys.argv
filename = ''
process_count = -1
for i, item in enumerate(params):
if(item.endswith('.txt')):
filename = item
if(item == '--top'):
process_count = int(params[i+1])
def set_result(item):
return item, domains.count(item)
# main
if(filename):
try:
start_time = time.time()
now = datetime.datetime.now()
dirname = "email_stats_{0}".format(now.strftime("%Y%m%d_%H%M%S"))
os.mkdir(dirname)
list = open(filename).read().split()
if(process_count == -1):
process_count = len(list)
if(process_count > 0):
list = list[:process_count]
#chunking list
n = int(len(list) / mp.cpu_count())
chunks = [list[i:i + n] for i in range(0, len(list), n)]
processes = []
print('Processing list on {0} cores...'.format(mp.cpu_count()))
for chunk in chunks:
p = mp.Process(target=process, args=[chunk])
p.start()
processes.append(p)
for p in processes:
p.join()
# insert(unique_list)
## step 2 - write sql
## Clearing out db before new data insert
con = pool.get_connection()
cur = con.cursor()
delete_statement = "DELETE FROM statistics"
cur.execute(delete_statement)
u_processes = []
#Maximum pool size for sql is 32, so maximum chunk number should be that too.
if(mp.cpu_count() < 32):
n2 = int(len(unique_list) / mp.cpu_count())
else:
n2 = int(len(unique_list) / 32)
u_chunks = [unique_list[i:i + n2] for i in range(0, len(unique_list), n2)]
for u_chunk in u_chunks:
p = mp.Process(target=insert, args=[u_chunk])
p.start()
u_processes.append(p)
for p in u_processes:
p.join()
for p in u_processes:
p.close()
# sql_cursor.executemany(sql_statement, sorted(map(set_result, unique_list)))
# db.commit()
# for item in tqdm(unique_list):
# sql_val = (item, domains.count(item))
# sql_cursor.execute(sql_statement, sql_val)
#
# db.commit()
## numpy.savetxt('saved.txt', sorted(map(set_result, unique_list)), fmt='%s')
# with(mp.Pool(mp.cpu_count(), initializer = db) as Pool:
# Pool.map_async(insert_into_database(),set(unique_list))
# Pool.close()
# Pool.join()
print('Creating statistics for {0} individual domains...'.format(len(unique_list)))
# unique_list = set(unique_list)
# with open("{0}/result.txt".format(dirname), "w+") as f:
# csv.writer(f).writerows(sorted(map(set_result, unique_list), reverse=True))
print('Writing final statistics...')
print('OK.')
f = open("{0}/stat.txt".format(dirname),"w+")
f.write("Number of processed emails: {0}\r\n".format(process_count))
f.write("Number of valid emails: {0}\r\n".format(len(list) - len(invalid_emails)))
f.write("Number of invalid emails: {0}\r\n".format(len(invalid_emails)))
f.write("Execution time: {0}".format(convertTime(int(time.time() - start_time))))
f.close()
except FileNotFoundError:
print('File not found, path or file broken.')
else:
print('Wrong file format, should be a txt file.')
# main
See my comments regarding some changes you might wish to make, one of which might improve performance. But I think one area where performance could really be improved is your use of managed lists. These are represented by proxies, and each operation on such a list is essentially a remote procedure call and thus very slow. You cannot avoid this, given that you need multiple processes updating a common, shared list (or dict, if you take my suggestion). But in the main process you might be trying, for example, to construct a set from a shared list as follows:
Pool.map_async(insert_into_database(),set(unique_list))
(by the way, that should be Pool.map(insert_into_database, set(unique_list)), i.e. you have an extra set of () and you can then get rid of the calls to pool.close() and pool.join() if you wish)
The problem is that you are iterating over every element of unique_list through a proxy, which might be what is taking a very long time. I say "might" because I would expect the use of managed lists to prevent the code as is, i.e. without outputting the results, from completing in "a few seconds" if we are talking about "millions" of records and thus millions of remote procedure calls. But that number could certainly be reduced if you could somehow get the underlying list as a native list.
First, you need to heed my comment about having declared a variable named list, which makes it impossible to create native lists or subclasses of list. Once you have renamed that variable to something more reasonable, we can create our own managed class MyList that exposes the underlying list on which it is built. Note that you can do the same thing with a MyDict class that subclasses dict; I have defined both classes for you. Here is a benchmark showing the difference between constructing a native list from a managed list versus constructing one from a MyList:
import multiprocessing as mp
from multiprocessing.managers import BaseManager
import time
class MyManager(BaseManager):
pass
class MyList(list):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
def get_underlying_list(self):
return self
class MyDict(dict):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
def get_underlying_dict(self):
return self
# required for windows, which I am running on:
if __name__ == '__main__':
l = mp.Manager().list()
for i in range(100_000):
l.append(i)
t = time.time()
l2 = list(l)
print(time.time() - t, l2[0:5], l2[-5:])
MyManager.register('MyList', MyList)
MyManager.register('MyDict', MyDict)
my_manager = MyManager()
# must explicitly start the manager or use: with MyManager() as manager:
my_manager.start()
l = my_manager.MyList()
for i in range(100_000):
l.append(i)
t = time.time()
l2 = list(l.get_underlying_list())
print(time.time() - t, l2[0:5], l2[-5:])
Prints:
7.3949973583221436 [0, 1, 2, 3, 4] [99995, 99996, 99997, 99998, 99999]
0.007997751235961914 [0, 1, 2, 3, 4] [99995, 99996, 99997, 99998, 99999]
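As a further illustration (a sketch, not code from the question; the variable names local_domains, domain_counts and rows are made up for it): once the shared data can be pulled across the proxy in a single call, the per-domain counting can be done locally with a Counter instead of calling domains.count(item) through the proxy once per unique domain:

from collections import Counter
from multiprocessing.managers import BaseManager

class MyList(list):
    def get_underlying_list(self):
        return self

class MyManager(BaseManager):
    pass

MyManager.register('MyList', MyList)

if __name__ == '__main__':
    with MyManager() as manager:
        domains = manager.MyList()
        # Stand-in for the worker processes appending domain strings.
        for d in ['a.com', 'b.com', 'a.com', 'c.com', 'a.com']:
            domains.append(d)

        # One proxy round-trip instead of one remote count() per unique domain.
        local_domains = list(domains.get_underlying_list())
        domain_counts = Counter(local_domains)

        rows = sorted(domain_counts.items())  # [(domain, count), ...] ready for executemany
        print(rows)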
I am using Python 3 with win32print, as in the code below. It should delete every print job, because TotalPages >= 1. I think all the data types are correct.
import time
import win32print
#----------------------------------------------------------------------
def print_job_checker():
"""
Polls the print queue every 0.25 seconds and deletes queued jobs until none remain
"""
jobs = [1]
while jobs:
jobs = []
for p in win32print.EnumPrinters(win32print.PRINTER_ENUM_LOCAL,
None, 1):
flags, desc, name, comment = p
phandle = win32print.OpenPrinter(name)
print_jobs = win32print.EnumJobs(phandle, 0, -1, 1)
if print_jobs:
jobs.extend(list(print_jobs))
for job in print_jobs:
print(job['TotalPages'])
if(job['TotalPages'] >= 1):
print(type(job))
win32print.SetJob(phandle, job['JobId'], 1, job, win32print.JOB_CONTROL_DELETE)
win32print.ClosePrinter(phandle)
time.sleep(0.25)
print ("No more jobs!")
#----------------------------------------------------------------------
if __name__ == "__main__":
while True:
print_job_checker()
When I run it, it shows this error:
File "C:/Users/test_printer.py", line 37, in print_job_checker
win32print.SetJob(phandle, job['JobId'], 1, job, win32print.JOB_CONTROL_DELETE)
error: (1804, 'SetJob', 'The specified datatype is invalid.')
Since you are deleting the job and not changing its attributes, try this:
win32print.SetJob(phandle, job['JobId'], 0, None, win32print.JOB_CONTROL_DELETE)
The following parameters have changed:
Level = 0
pPrinter = None
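For context, here is a self-contained sketch of the whole deletion loop with that change applied; the enumeration code is the same as in the question, and the helper name purge_all_jobs is made up for the illustration:

import win32print

def purge_all_jobs():
    # Delete every queued job on all local printers.
    for flags, desc, name, comment in win32print.EnumPrinters(
            win32print.PRINTER_ENUM_LOCAL, None, 1):
        phandle = win32print.OpenPrinter(name)
        try:
            for job in win32print.EnumJobs(phandle, 0, -1, 1):
                # Level 0 with a None job structure: nothing about the job is
                # modified, only the delete command is applied to this JobId.
                win32print.SetJob(phandle, job['JobId'], 0, None,
                                  win32print.JOB_CONTROL_DELETE)
        finally:
            win32print.ClosePrinter(phandle)

if __name__ == "__main__":
    purge_all_jobs()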
I have a database record set (approx. 1000 rows), and I currently iterate through it, enriching each record with an extra DB query.
Doing that raises the overall processing time to maybe 100 seconds.
What I want to do is spread that work across 2-4 processes.
I am using Python 2.7 for AWS Lambda compatibility.
def handler(event, context):
try:
records = connection.get_users()
mandrill_client = open_mandrill_connection()
mandrill_messages = get_mandrill_messages()
mandrill_template = 'POINTS weekly-report-to-user'
start_time = time.time()
messages = build_messages(mandrill_messages, records)
print("OVERALL: %s seconds ---" % (time.time() - start_time))
send_mandrill_message(mandrill_client, mandrill_template, messages)
connection.close_database_connection()
return "Process Completed"
except Exception as e:
print(e)
Below is the function that I want to run in threads:
def build_messages(messages, records):
for record in records:
record = dict(record)
stream = get_user_stream(record)
data = compile_loyalty_stream(stream)
messages['to'].append({
'email': record['email'],
'type': 'to'
})
messages['merge_vars'].append({
'rcpt': record['email'],
'vars': [
{
'name': 'total_points',
'content': record['total_points']
},
{
'name': 'total_week',
'content': record['week_points']
},
{
'name': 'stream_greek',
'content': data['el']
},
{
'name': 'stream_english',
'content': data['en']
}
]
})
return messages
What I have tried is importing the multiprocessing library:
from multiprocessing.pool import ThreadPool
I created a pool inside the try block and mapped the function over it:
pool = ThreadPool(4)
messages = pool.map(build_messages_in, itertools.izip(itertools.repeat(mandrill_messages), records))
def build_messages_in(a_b):
build_msg(*a_b)
def build_msg(a, b):
return build_messages(a, b)
def get_user_stream(record):
response = []
i = 0
for mod, mod_id, act, p, act_created in izip(record['models'], record['model_ids'], record['actions'],
record['points'], record['action_creation']):
information = get_reference(mod, mod_id)
if information:
response.append({
'action': act,
'points': p,
'created': act_created,
'info': information
})
if (act == 'invite_friend') \
or (act == 'donate') \
or (act == 'bonus_500_general') \
or (act == 'bonus_1000_general') \
or (act == 'bonus_500_cancel') \
or (act == 'bonus_1000_cancel'):
response[i]['info']['date_ref'] = act_created
response[i]['info']['slug'] = 'attiki'
if (act == 'bonus_500_general') \
or (act == 'bonus_1000_general') \
or (act == 'bonus_500_cancel') \
or (act == 'bonus_1000_cancel'):
response[i]['info']['title'] = ''
i += 1
return response
Finally, I removed the for loop from the build_messages function.
What I get as a result is a 'NoneType' object is not iterable error.
Is this the correct way of doing this?
Your code is fairly involved, so you can't be sure that multithreading applied at a high level will bring any performance gain. Therefore, it's worth digging down to the point that gives you the largest latency and considering how to approach that specific bottleneck. See here for a longer discussion of the limitations of threading.
If, for example as we discussed in the comments, you can pinpoint a single task that is taking a long time, then you could try to parallelize it using multiprocessing instead, to leverage more of your CPU power. Here is a generic example that is hopefully simple enough to follow; it mirrors your Postgres queries without going into your own code base, which I think would be an infeasible amount of effort, to be honest.
import multiprocessing as mp
import time
import random
import datetime as dt
MAILCHIMP_RESPONSE = [x for x in range(1000)]
def chunks(l, n):
n = max(1, n)
return [l[i:i + n] for i in range(0, len(l), n)]
def db_query():
''' Delayed response from database '''
time.sleep(0.01)
return random.random()
def do_queries(query_list):
''' The function that takes all your query ids and executes them
sequentially for each id '''
results = []
for item in query_list:
query = db_query()
# Your super-quick processing of the Postgres response
processing_result = query * 2
results.append([item, processing_result])
return results
def single_processing():
''' As you do now - equivalent to get_reference '''
result_of_process = do_queries(MAILCHIMP_RESPONSE)
return result_of_process
def multi_process(chunked_data, queue):
''' Same as single_processing, except we put our results in queue rather
than returning them '''
result_of_process = do_queries(chunked_data)
queue.put(result_of_process)
def multiprocess_handler():
''' Divide and conquer on our db requests. We split the mailchimp response
into a series of chunks and fire our queries simultaneously. Thus, each
concurrent process has a smaller number of queries to make '''
num_processes = 4 # depending on cores/resources
size_chunk = len(MAILCHIMP_RESPONSE) / num_processes
chunked_queries = chunks(MAILCHIMP_RESPONSE, size_chunk)
queue = mp.Queue() # This is going to combine all the results
processes = [mp.Process(target=multi_process,
args=(chunked_queries[x], queue)) for x in range(num_processes)]
for p in processes: p.start()
divide_and_conquer_result = []
for p in processes:
divide_and_conquer_result.extend(queue.get())
return divide_and_conquer_result
if __name__ == '__main__':
start_single = dt.datetime.now()
single_process = single_processing()
print "Single process took {}".format(dt.datetime.now() - start_single)
print "Number of records processed = {}".format(len(single_process))
start_multi = dt.datetime.now()
multi = multiprocess_handler()
print "Multi process took {}".format(dt.datetime.now() - start_multi)
print "Number of records processed = {}".format(len(multi))
I wrote and rewrote my little Python application to the point where my current Python skills aren't enough. I started with a single-threaded application using Beautiful Soup as the parser and switched to lxml. I made the script multi-threaded; I discovered Twisted, but couldn't convert this little snippet to it. I will post it here so that maybe you can point me in a better direction to make it a bit faster. Fetching 150k pages currently takes about an hour; I am happy with that, because my first attempt at writing it was three times slower.
#! /usr/bin/python
# coding: ISO-8859-1
import time, PySQLPool, Queue, threading
from urllib3 import connection_from_url
from lxml import etree
import cStringIO as StringIO
headers = {
'User-Agent' : 'Mozilla/4.77 [en] (X11; I; IRIX;64 6.5 IP30)',
'Accept' : 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Language' : 'en-us;q=0.5,en;q=0.3',
'Accept-Encoding' : 'gzip, deflate',
'Accept-Charset' : 'utf-8;q=0.7,*;q=0.7'
}
t = time.time()
PySQLPool.getNewPool().maxActiveConnections = 60
db = PySQLPool.getNewConnection(username='user', password='pass', host='127.0.0.1', db='fddb')
pool = connection_from_url('http://fddb.info/', maxsize=60, timeout=150, headers=headers)
detailCounter = 0
urls = {}
queue = Queue.Queue()
out_queue = Queue.Queue()
clean_rows = {
"Brennwert":"details_brennwert",
"Kalorien":"details_kalorien",
"Protein":"details_protein",
"Kohlenhydrate":"details_kohlenhydrate",
"davon Zucker":"details_zucker",
"davon Polyole":"details_polyole",
"Fett":"details_fett",
"Ballaststoffe":"details_ballaststoffe",
"Broteinheiten":"details_broteinheit",
"Alkohol":"details_alkohol",
"Cholesterin":"details_cholesterin",
"Koffein":"details_koffein",
"Wassergehalt":"details_wasser",
"Vitamin C":"details_vitc",
"Vitamin A":"details_vita",
"Vitamin D":"details_vitd",
"Vitamin E":"details_vite",
"Vitamin B1":"details_vitb1",
"Vitamin B2":"details_vitb2",
"Vitamin B6":"details_vitb6",
"Vitamin B12":"details_vitb12",
"Natrium":"details_natrium",
"Eisen":"details_eisen",
"Zink":"details_zink",
"Magnesium":"details_magnesium",
"Chlor":"details_chlor",
"Mangan":"details_mangan",
"Schwefel":"details_schwefel",
"Kalium":"details_kalium",
"Kalzium":"details_kalzium",
"Phosphor":"details_phosphor",
"Kupfer":"details_kupfer",
"Fluor":"details_fluor"
}
def rows_escape(text):
for item, key in clean_rows.items():
text = text.replace(item, key)
text = text.rstrip()
return text
clean_values = {
"kJ" :"",
"kcal" :"",
"g" :"",
"mg" :"",
"%" :"",
"," :".",
u"\u03bc": ""
}
def values_escape(text):
for item, key in clean_values.items():
text = text.replace(item, key)
text = text.rstrip()
return text
def insertDetails(container, foods_id):
c = PySQLPool.getNewQuery(db)
query_rows = ''
query_values = ''
for item in container:
query_rows += item['row'] + ','
query_values += item['value'] + ','
c.Query("INSERT INTO details (%sdetails_id,foods_id) VALUES (%sNULL,%s)" % (query_rows, query_values, foods_id))
c.Query("UPDATE foods SET foods_check = '1' WHERE foods_id=%d" % (foods_id))
def getHP(url):
r = pool.request('GET', '/' + url)
return r.data
class ThreadUrl(threading.Thread):
def __init__(self, queue, out_queue):
threading.Thread.__init__(self)
self.queue = queue
self.out_queue = out_queue
def run(self):
while True:
host = self.queue.get()
data = getHP(host[0])
self.out_queue.put([data, host[1]])
self.queue.task_done()
class DatamineThread(threading.Thread):
def __init__(self, out_queue):
threading.Thread.__init__(self)
self.out_queue = out_queue
def run(self):
while True:
global detailCounter
qData = self.out_queue.get()
data = qData[0]
foods_id = qData[1]
container = []
parser = etree.HTMLParser(encoding='cp1252')
tree = etree.parse(StringIO.StringIO(data), parser)
divx = tree.xpath('//div[@style="background-color:#f0f5f9;padding:2px 4px;" or @style="padding:2px 4px;"]')
for xdiv in divx:
x = etree.ElementTree(element=xdiv, parser=parser)
value = x.xpath('string(//div/text())')
label = x.xpath('string(//*[self::a or self::span]/text())')
label = rows_escape(label)
if not "[nodata]" in value:
if u"\u03bc" in value:
value = values_escape(value)
item4 = 0
item4 = float(value)
item4 = item4 / 1000
container.append({'row':label,'value':str(item4)})
else:
container.append({'row':label,'value':values_escape(value)})
detailCounter += 1
container = tuple(container)
insertDetails(container, foods_id)
self.out_queue.task_done()
def main():
c = PySQLPool.getNewQuery(db)
c.Query("SELECT foods_id, foods_url FROM foods WHERE foods_check = 0")
urls = c.record
for i in range(6):
t = ThreadUrl(queue, out_queue)
t.setDaemon(True)
t.start()
for item in urls:
queue.put([item['foods_url'], item['foods_id']])
for i in range(6):
dt = DatamineThread(out_queue)
dt.setDaemon(True)
dt.start()
queue.join()
out_queue.join()
main()
db.close()
print "Zeit: %.2f New Details: %d" % (time.time()-t, detailCounter)
I suggest you use the multiprocessing module if you have multiple CPUs and your program is very CPU-intensive. Python is notoriously bad at multithreading because of the Global Interpreter Lock (GIL), which basically ensures that at any given time only one Python thread can execute in a single process.
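To make that concrete, here is a minimal sketch (not taken from the code above) of moving only the CPU-heavy parsing step into a process pool while the downloads stay in threads as they are now; parse_page and fetched_pages are placeholders for the lxml work and the pages fetched by the ThreadUrl threads:

import multiprocessing

def parse_page(html):
    # Placeholder for the lxml/etree work currently done in DatamineThread.run();
    # it runs in a separate process, so it is not limited by the GIL.
    return len(html)  # return the extracted nutrition rows instead

if __name__ == '__main__':
    # Placeholder for the HTML documents fetched by the ThreadUrl threads.
    fetched_pages = ['<html>page %d</html>' % i for i in range(100)]

    # One worker per core; each parse_page call runs in its own process.
    pool = multiprocessing.Pool(processes=multiprocessing.cpu_count())
    results = pool.map(parse_page, fetched_pages)
    pool.close()
    pool.join()

    print(sum(results))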