Datamining Multithreading vs Multiprocessing - python

I wrote and rewrote my little Python application to the point where my current Python skills aren't enough. I started with a single-threaded application using Beautiful Soup as the parser, then switched to lxml. I made the script multi-threaded; I discovered Twisted, but couldn't port this little snippet to it. I'll just post it here so maybe you can point me in a better direction to make it a bit faster. Fetching 150k pages takes about an hour at this point. I'm happy with that, since my first attempt was three times slower.
#! /usr/bin/python
# coding: ISO-8859-1
import time, PySQLPool, Queue, threading
from urllib3 import connection_from_url
from lxml import etree
import cStringIO as StringIO
headers = {
'User-Agent' : 'Mozilla/4.77 [en] (X11; I; IRIX;64 6.5 IP30)',
'Accept' : 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Language' : 'en-us;q=0.5,en;q=0.3',
'Accept-Encoding' : 'gzip, deflate',
'Accept-Charset' : 'utf-8;q=0.7,*;q=0.7'
}
t = time.time()
PySQLPool.getNewPool().maxActiveConnections = 60
db = PySQLPool.getNewConnection(username='user', password='pass', host='127.0.0.1', db='fddb')
pool = connection_from_url('http://fddb.info/', maxsize=60, timeout=150, headers=headers)
detailCounter = 0
urls = {}
queue = Queue.Queue()
out_queue = Queue.Queue()
clean_rows = {
"Brennwert":"details_brennwert",
"Kalorien":"details_kalorien",
"Protein":"details_protein",
"Kohlenhydrate":"details_kohlenhydrate",
"davon Zucker":"details_zucker",
"davon Polyole":"details_polyole",
"Fett":"details_fett",
"Ballaststoffe":"details_ballaststoffe",
"Broteinheiten":"details_broteinheit",
"Alkohol":"details_alkohol",
"Cholesterin":"details_cholesterin",
"Koffein":"details_koffein",
"Wassergehalt":"details_wasser",
"Vitamin C":"details_vitc",
"Vitamin A":"details_vita",
"Vitamin D":"details_vitd",
"Vitamin E":"details_vite",
"Vitamin B1":"details_vitb1",
"Vitamin B2":"details_vitb2",
"Vitamin B6":"details_vitb6",
"Vitamin B12":"details_vitb12",
"Natrium":"details_natrium",
"Eisen":"details_eisen",
"Zink":"details_zink",
"Magnesium":"details_magnesium",
"Chlor":"details_chlor",
"Mangan":"details_mangan",
"Schwefel":"details_schwefel",
"Kalium":"details_kalium",
"Kalzium":"details_kalzium",
"Phosphor":"details_phosphor",
"Kupfer":"details_kupfer",
"Fluor":"details_fluor"
}
def rows_escape(text):
for item, key in clean_rows.items():
text = text.replace(item, key)
text = text.rstrip()
return text
clean_values = {
"kJ" :"",
"kcal" :"",
"g" :"",
"mg" :"",
"%" :"",
"," :".",
u"\u03bc": ""
}
def values_escape(text):
for item, key in clean_values.items():
text = text.replace(item, key)
text = text.rstrip()
return text
def insertDetails(container, foods_id):
c = PySQLPool.getNewQuery(db)
query_rows = ''
query_values = ''
for item in container:
query_rows += item['row'] + ','
query_values += item['value'] + ','
c.Query("INSERT INTO details (%sdetails_id,foods_id) VALUES (%sNULL,%s)" % (query_rows, query_values, foods_id))
c.Query("UPDATE foods SET foods_check = '1' WHERE foods_id=%d" % (foods_id))
def getHP(url):
r = pool.request('GET', '/' + url)
return r.data
class ThreadUrl(threading.Thread):
def __init__(self, queue, out_queue):
threading.Thread.__init__(self)
self.queue = queue
self.out_queue = out_queue
def run(self):
while True:
host = self.queue.get()
data = getHP(host[0])
self.out_queue.put([data, host[1]])
self.queue.task_done()
class DatamineThread(threading.Thread):
def __init__(self, out_queue):
threading.Thread.__init__(self)
self.out_queue = out_queue
def run(self):
while True:
global detailCounter
qData = self.out_queue.get()
data = qData[0]
foods_id = qData[1]
container = []
parser = etree.HTMLParser(encoding='cp1252')
tree = etree.parse(StringIO.StringIO(data), parser)
divx = tree.xpath('//div[@style="background-color:#f0f5f9;padding:2px 4px;" or @style="padding:2px 4px;"]')
for xdiv in divx:
x = etree.ElementTree(element=xdiv, parser=parser)
value = x.xpath('string(//div/text())')
label = x.xpath('string(//*[self::a or self::span]/text())')
label = rows_escape(label)
if not "[nodata]" in value:
if u"\u03bc" in value:
value = values_escape(value)
item4 = 0
item4 = float(value)
item4 = item4 / 1000
container.append({'row':label,'value':str(item4)})
else:
container.append({'row':label,'value':values_escape(value)})
detailCounter += 1
container = tuple(container)
insertDetails(container, foods_id)
self.out_queue.task_done()
def main():
c = PySQLPool.getNewQuery(db)
c.Query("SELECT foods_id, foods_url FROM foods WHERE foods_check = 0")
urls = c.record
for i in range(6):
t = ThreadUrl(queue, out_queue)
t.setDaemon(True)
t.start()
for item in urls:
queue.put([item['foods_url'], item['foods_id']])
for i in range(6):
dt = DatamineThread(out_queue)
dt.setDaemon(True)
dt.start()
queue.join()
out_queue.join()
main()
db.close()
print "Zeit: %.2f New Details: %d" % (time.time()-t, detailCounter)

I suggest you use the multiprocessing module if you have multiple CPUs AND your program is very CPU-intensive. Python is notoriously bad at multithreading CPU-bound work because of the Global Interpreter Lock (GIL), which ensures that at any given time only one Python thread can execute in a single process.
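For what it's worth, here is a minimal Python 3 sketch of that split (not your script; the fetch and parse helpers and the URL list are placeholders): the I/O-bound downloads stay in threads, which release the GIL while they wait on the network, and only the CPU-bound lxml parsing goes to a process pool:
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
import urllib3
from lxml import etree

http = urllib3.PoolManager(maxsize=60, timeout=150)

def fetch(url):
    # I/O-bound: threads are fine here because the GIL is released during network waits
    return http.request('GET', url).data

def parse(html):
    # CPU-bound: runs in a worker process, so it is not serialized by the GIL
    tree = etree.fromstring(html, etree.HTMLParser(encoding='cp1252'))
    return tree.xpath('count(//div)')  # placeholder for the real XPath work

if __name__ == '__main__':
    urls = ['http://fddb.info/']  # placeholder URL list
    with ThreadPoolExecutor(max_workers=6) as io_pool, \
         ProcessPoolExecutor(max_workers=4) as cpu_pool:
        pages = io_pool.map(fetch, urls)
        results = list(cpu_pool.map(parse, pages))
    print(results)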

Related

Modbus TCP server with Python

I am starting a program using Python. I want to send some information from Python to the Simply Modbus TCP Client. To be honest, I don't really know how to use Python yet, and I'm learning Modbus at the same time.
The error message I am getting is:
runfile('C:/Users/rafa_/OneDrive/Área de Trabalho/TCC_v.0/2/simhvac.py', wdir='C:/Users/rafa_/OneDrive/Área de Trabalho/TCC_v.0/2')
Erro: ("'Hvac' object has no attribute '_server'",)
My code is below (can somebody help me?):
from threading import Thread
from pyModbusTCP.server import DataBank, ModbusServer
from time import sleep
import time
class ServidorMODBUS():
""" Classe Servidor Modbus"""
def __init__(self, host_ip, port):
"""construtor"""
self._server = ModbusServer(host=host_ip, port=port, no_block=True)
self._db = DataBank
class Hvac(Thread):
def __init__(self):
Thread.__init__(self)
self.Ts = 0.5 # Sampling period in seconds
# Initial conditions
self.temperature = 25 # Initial temperature (Celsius degree)
self.co2 = 400 # Initial CO2 concentration (ppm)
self.humity = 33 # Initial Humidity (% RH)
self.number_of_occup = 1
self.co2_air_inlet = 200
self.T_air = 23
self.T_ac = 20
self.H_ac = 90
self.H_air = 90
# CO2 Parameters
self.co2_params = {
'a1': -0.8964,
'kn': 2.108,
'ki': 0.0579,
'kd': self.Ts
}
# Temperature Parameters
self.temperature_params = {
'beta_i1' : -1.179,
'beta_i2' : 0.2567,
'alfa_ac' : 0.0043,
'alfa_air' : 0.0762,
'alfa_0' : 0.0013,
'alfa_1' : -9.96E-4,
'alfa_I' : -3.06E-4
}
# Humity Parameters
self.humity_params = {
'delta_i1' : -1.6696,
'delta_i2' : 0.6844,
'gamma_ac' : -2.83E-4,
'gamma_air' : 0.0147,
'gamma_T' : 2.16E-4,
'gamma_0' : 0.0016,
'gamma_1' : 0.0018,
'gamma_I' : 4.98E-5
}
def run(self):
co2 = [self.co2, self.co2]
temp = [self.temperature, self.temperature]
humity = [self.humity, self.humity]
co2_il = [self.co2_air_inlet, self.co2_air_inlet]
"""Execuçao MODBUS"""
try:
self._server.start()
print("Servidor MODBUS online")
while True:
t = time.time()
# Process CO2, temperature, humity
(a1, kn, ki, kd) = self.co2_params.values()
n = self.number_of_occup
i = self.co2_air_inlet
d = 0 # disturbance
self.co2 = kn*n + ki*i + kd*d - a1*co2[0]
(beta_i1, beta_i2, alfa_ac, alfa_air, alfa_0, alfa_1, alfa_I) = self.temperature_params.values()
T_ac = self.T_ac
T_air = self.T_air
self.temperature = alfa_ac*T_ac+alfa_air*T_air+alfa_0*co2[0]+alfa_1*co2[1]+alfa_I*co2_il[1] - beta_i1*temp[0]-beta_i2*temp[1]
(delta_i1,delta_i2,gamma_ac,gamma_air,gamma_T,gamma_0,gamma_1,gamma_I) = self.humity_params.values()
H_ac = self.H_ac
H_air = self.H_air
self.humity = gamma_ac*H_ac+gamma_air*H_air+gamma_T*self.temperature+gamma_0*co2[0]+gamma_1*co2[1]+gamma_I*co2_il[1]-delta_i1*humity[0]-delta_i2*humity[1]
# Update delays
co2 = [self.co2, co2[0]]
temp = [self.temperature, temp[0]]
humity = [self.humity, humity[0]]
co2_il = [self.co2_air_inlet, co2_il[0]]
print('CO2:', self.co2, 'Temp:', self.temperature, 'Humidity:', self.humity)
time.sleep(self.Ts-(time.time()-t))
self._db.set_words(1000, co2, temp, humity, co2_il)
print("________________")
print("Tabela MODBUS")
print(f'Holding Register \r\n R1000: {self.db.get_words(1000)}\r\n R2000 {self.db.get_words(2000)}')
print(f'Coil \r\n R1000:{self.db.get_bits(1000)}')
sleep(1)
except Exception as e:
print("Erro: ", e.args)
The screenshots below show the error.
When posting a question on Stack Overflow, it's good to include a minimal reproducible example, because this makes it easier for others to quickly assess your issue (and in many cases you will solve it yourself while creating the example). The code you pasted (including the code in the screenshots) can be reduced to:
from threading import Thread
class Hvac(Thread):
def run(self):
try:
self._server.start()
except Exception as e:
print("Erro: ", e.args)
HvacSim = Hvac()
HvacSim.start()
Running this gives the same error you are receiving:
Erro: ("'Hvac' object has no attribute '_server'",)
You can actually simplify this further by removing the try/except, which will result in a different (and more helpful) error message (telling you the line where the error occurred, etc.).
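For example, the same reduced snippet without the try/except still fails, but Python then prints a full traceback pointing at the offending line instead of just the message caught inside run():
from threading import Thread

class Hvac(Thread):
    def run(self):
        # no try/except: the AttributeError now surfaces with a full traceback
        self._server.start()

HvacSim = Hvac()
HvacSim.start()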
It's likely you are thinking "but I defined _server here":
class ServidorMODBUS():
""" Classe Servidor Modbus"""
def __init__(self, host_ip, port):
"""construtor"""
self._server = ModbusServer(host=host_ip, port=port, no_block=True)
However, that is within the class ServidorMODBUS, not Hvac (you don't actually appear to be using ServidorMODBUS at all). It's not really clear what you intend to do with ServidorMODBUS; I would guess you want to create an instance of it in the __init__ of Hvac.
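If that guess is right, a minimal sketch of the fix (the host and port values here are placeholders, and it simply mirrors the imports already in your code) is to build the server inside Hvac.__init__ so that self._server exists before run() uses it:
from threading import Thread
from pyModbusTCP.server import DataBank, ModbusServer

class Hvac(Thread):
    def __init__(self, host_ip='127.0.0.1', port=502):
        Thread.__init__(self)
        # create the Modbus server here so self._server exists when run() needs it
        self._server = ModbusServer(host=host_ip, port=port, no_block=True)
        self._db = DataBank

    def run(self):
        self._server.start()
        print("Servidor MODBUS online")

HvacSim = Hvac()
HvacSim.start()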

Getting Pool apply_async return too slow

I'm trying to make a bot for IQ Option.
I already did it, but one pair at a time: I had to open 10 bots just to check 10 pairs.
I've been trying all day with ThreadPool, threading, map and starmap (I probably didn't use them as well as they can be used).
The thing is: I'm checking the values of pairs (EURUSD, EURAUD...) over the last 100 minutes. When I do it one by one, each call takes between 80 and 300 ms to return. I'm now trying to fire all the calls at roughly the same time and collect the results into their respective variables at around the same time.
At the moment my code looks like this:
from iqoptionapi.stable_api import IQ_Option
from functools import partial
from multiprocessing.pool import ThreadPool as Pool
from time import *
from datetime import datetime, timedelta
import os
import sys
import dados #my login data
import config #atm is just payoutMinimo = 0.79
parAtivo = {}
class PAR:
def __init__(self, par, velas):
self.par = par
self.velas = velas
self.lucro = 0
self.stoploss = 50000
self.stopgain = 50000
def verificaAbertasPayoutMinimo(API, payoutMinimo):
status = API.get_all_open_time()
profits = API.get_all_profit()
abertasPayoutMinimo = []
for x in status['turbo']:
if status['turbo'][x]['open'] and profits[x]['turbo'] >= payoutMinimo:
abertasPayoutMinimo.append(x)
return abertasPayoutMinimo
def getVelas(API, par, tempoAN, segundos, numeroVelas):
return API.get_candles(par, tempoAN*segundos, numeroVelas, time()+50)
def logVelas(velas, par):
global parAtivo
parAtivo[par] = PAR(par, velas)
def verificaVelas(API, abertasPayoutMinimo, tempoAN, segundos, numeroVelas):
pool = Pool()
global parAtivo
for par in abertasPayoutMinimo:
print(f"Verificando par {par}")
pool = Pool()
if par not in parAtivo:
callbackFunction = partial(logVelas, par=par)
pool.apply_async(
getVelas,
args=(API, par, tempoAN, segundos, numeroVelas),
callback=callbackFunction
)
pool.close()
pool.join()
def main():
tempoAN = 1
segundos = 60
numeroVelas = 20
tempoUltimaVerificacao = datetime.now() - timedelta(days=99)
global parAtivo
conectado = False
while not conectado:
API = IQ_Option(dados.user, dados.pwd)
API.connect()
if API.check_connect():
os.system("cls")
print("Conectado com sucesso.")
sleep(1)
conectado = True
else:
print("Erro ao conectar.")
sleep(1)
conectado = False
API.change_balance("PRACTICE")
while True:
if API.get_balance() < 2000:
API.reset_practice_balance()
if datetime.now() > tempoUltimaVerificacao + timedelta(minutes=5):
abertasPayoutMinimo = verificaAbertasPayoutMinimo(API, config.payoutMinimo)
tempoUltimaVerificacao = datetime.now()
verificaVelas(API, abertasPayoutMinimo, tempoAN, segundos, numeroVelas)
for item in parAtivo:
print(parAtivo[item])
break #execute only 1 time for testing
if __name__ == "__main__":
main()
Edit 1: added more info; this is actually the whole code right now.
Edit 2: when I print it like this:
for item in parAtivo:
print(parAtivo[item].velas[-1]['close'])
I get:
0.26671
0.473878
0.923592
46.5628
1.186974
1.365679
0.86263
That is correct; the problem is that it takes too long, almost 3 seconds, the same as when I was doing it without ThreadPool.
Solved.
I did it using threading.Thread, like this:
for par in abertasPayoutMinimo:
t = threading.Thread(
target=getVelas,
args=(API, par, tempoAN, segundos)
)
t.start()
t.join()
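Note that calling t.join() inside the same loop that starts each thread makes the main thread wait for one request to finish before launching the next, so the calls still run one after another. A small variant of the same idea (reusing getVelas and logVelas from above, plus a hypothetical fetchVelas wrapper) starts every thread first and only then joins, so the requests actually overlap:
import threading

def fetchVelas(API, par, tempoAN, segundos, numeroVelas):
    # fetch and store the candles, as logVelas did for the apply_async callback
    logVelas(getVelas(API, par, tempoAN, segundos, numeroVelas), par)

threads = []
for par in abertasPayoutMinimo:
    t = threading.Thread(
        target=fetchVelas,
        args=(API, par, tempoAN, segundos, numeroVelas)
    )
    t.start()
    threads.append(t)

# join only after all threads have been started, so the requests run concurrently
for t in threads:
    t.join()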

Python proxy scraping: Max retries exceeded with url (only on Mac, Windows works fine)

Big Edit: I have found out that I get this error only when executing on Mac. Windows works fine and never throws the exception.
I would like to get information from Discogs for personal/educational use.
I am building a crawler in Python (using the requests lib). To speed up the whole process, I am using free proxy lists and crawling through proxies. Here I have found some free, reliable proxies, which I have put in my code.
I want to visit all master pages, traversing the pagination pages and every master page on them, starting from this page. This example contains 2 pages (37 masters in total).
I have created threads which take actions from an action buffer and know how to execute each type of action.
Helper
import queue
import random
from fake_useragent import UserAgent
proxy_list = ["45.55.27.88:8080", "162.243.107.120:3128", "67.205.146.29:3128", "104.236.238.10:3128",
"138.197.222.35:3128", "198.199.120.102:3128", "162.243.99.57:8080", "138.68.173.29:3128",
"162.243.107.43:3128", "162.243.107.43:8080", "162.243.108.129:3128", "162.243.108.161:3128",
"162.243.108.161:8080", "162.243.78.25:3128", "67.205.146.29:8080", "67.205.174.209:3128",
"138.68.165.154:3128", "138.68.169.77:3128", "138.197.58.55:3128", "138.68.169.8:8080",
"207.154.231.212:3128", "138.68.169.8:3128", "138.68.161.60:3128", "212.47.252.91:8118",
"206.246.82.2:443", "202.166.117.46:8080", "185.93.3.70:8080", "192.117.146.110:80", "151.80.58.175:80",
"139.162.235.163:31028", "103.25.138.233:8080", "163.172.173.187:3000", "113.253.113.90:80",
"113.255.76.120:80", "159.8.114.37:25", "159.8.114.37:8123", "51.255.198.111:9999", "37.59.32.112:1080",
"178.33.9.96:1080", "178.33.9.97:1080", "178.33.9.100:1080", "151.106.31.195:1080",
"134.119.205.248:1080", "134.119.205.252:1080", "134.119.205.253:1080", "37.187.149.234:1080",
"94.177.237.184:80", "178.33.9.101:1080", "134.119.184.69:1080", "134.119.184.70:1080",
"134.119.184.75:1080", "134.119.184.87:1080", "134.119.184.94:1080", "94.177.237.184:8080",
"134.119.205.243:1080", "88.190.203.36:80", "37.59.35.174:1080", "79.142.202.109:8080",
"5.196.205.139:3128", "37.59.203.129:1080", "37.59.203.133:1080", "37.59.203.135:1080",
"178.33.9.99:1080", "178.33.9.103:1080", "138.68.169.77:3128", "162.243.107.43:8080", "45.55.27.15:3128",
"104.155.75.187:8080", "142.93.51.159:80", "213.148.240.2:80", "80.211.181.37:80", "66.70.170.147:80",
"54.39.98.138:80", "204.48.22.246:80", "80.211.48.120:80", "142.93.182.13:80", "142.93.251.113:80",
"66.70.173.54:80", "142.93.49.169:80", "192.99.226.30:80", "80.211.180.201:80", "213.136.87.65:80",
"220.90.147.137:80", "68.185.57.66:80", "68.188.59.198:80", "50.203.239.19:80", "50.234.147.30:80",
"148.251.238.35:80", "98.142.36.181:80", "128.140.225.41:80", "50.203.239.21:80", "50.203.239.31:80",
"50.203.239.22:80", "75.150.88.59:80", "71.13.131.142:80", "27.255.91.146:80", "104.196.241.137:80",
"94.177.237.184:3128", "134.119.205.244:1080", "37.59.203.132:1080", "178.128.176.221:8080",
"142.93.250.239:80", "89.233.175.210:41258", "37.59.203.128:1080", "139.59.53.106:8080",
"37.187.149.129:1080", "84.115.252.221:8080", "217.23.13.52:1080", "185.2.82.23:1080",
"139.59.99.63:8080", "139.59.99.97:3128", "139.59.99.97:8080", "139.59.99.63:3128", "138.68.161.157:8080",
"138.68.161.14:8080", "138.68.161.157:3128", "204.48.22.246:8080", "5.2.137.13:3128",
"142.93.250.239:8080", "194.85.169.208:3128", "139.59.101.223:8080", "108.61.186.207:8080",
"217.61.125.74:8080", "91.89.53.235:8080", "80.211.48.120:3128", "142.93.49.169:3128",
"138.68.120.201:8080", "95.85.36.236:3128", "142.93.182.13:8080", "223.16.229.241:8080",
"142.93.58.158:8080", "142.93.247.178:3128", "217.23.10.12:1080", "217.61.125.74:3128",
"142.93.58.158:3128", "142.93.51.159:3128", "139.59.59.63:8080", "138.197.139.135:3128",
"139.59.64.9:8080", "212.237.15.108:3128", "139.59.99.113:3128", "188.226.141.61:8080",
"66.70.170.147:8080", "66.70.173.54:3128", "54.39.98.138:8799", "163.47.11.113:3128",
"139.59.101.223:3128", "138.197.157.60:3128", "138.197.157.66:3128", "207.154.231.211:3128",
"178.62.193.19:3128", "188.226.141.216:3128", "138.197.204.55:3128", "138.197.204.55:8080",
"139.59.109.156:3128", "138.197.157.45:8080", "138.197.157.44:8080", "207.154.231.209:3128",
"188.226.141.211:3128", "138.197.157.45:3128", "138.197.157.68:3128", "46.5.252.70:3128",
"139.59.99.101:3128", "188.166.216.210:3128", "138.197.157.32:3128", "207.154.231.216:3128",
"138.68.161.60:8080", "178.62.193.19:8080", "188.226.141.127:3128", "138.197.222.35:8080",
"188.226.141.217:3128", "138.197.145.103:3128", "138.197.157.32:8080", "138.197.157.60:8080",
"146.185.168.235:3128", "207.154.231.210:3128", "162.243.107.45:8080", "188.226.141.219:3128",
"88.198.24.108:3128", "138.68.230.88:3128", "45.55.27.88:3128", "139.59.99.119:3128",
"138.197.157.68:8080", "192.241.150.188:3128", "138.68.161.14:3128", "138.68.173.29:8080",
"162.243.175.141:3128", "138.197.157.44:3128", "138.68.169.77:8080", "46.4.96.137:3128",
"138.68.235.8:8080", "139.59.99.234:3128"]
random.shuffle(proxy_list)
class RequestHelper:
proxies = None
def __init__(self):
self.proxies = self._get_proxies()
def _get_proxies(self):
temp = queue.Queue()
for proxy in proxy_list:
temp.put(proxy)
return temp
def put(self, proxy):
self.proxies.put(proxy)
def get_data(self):
ip = self.proxies.get()
proxy = {'http': 'http://' + ip}
user_agent = {'user-agent': UserAgent().random}
return {'proxy': proxy, 'user-agent': user_agent, 'ip': ip}
ActionQueue
import queue
class ActionQueue:
actions = None
# action = {'url': URL, 'action': TYPE_MASTER_LIST|TYPE_MASTER_PAGE|TYPE_RELEASE_PAGE }
def __init__(self):
self.actions = queue.Queue()
def get_next(self):
try:
return self.actions.get_nowait()
except queue.Empty as e:
return None
def put(self, action):
self.actions.put(action)
Worker(Thread)
import requests
import threading
from bs4 import BeautifulSoup
from time import sleep
BASE_URL = 'https://www.discogs.com'
TYPE_MASTER_LIST = 1
TYPE_MASTER_PAGE = 2
TYPE_RELEASE_PAGE = 3
class Worker(threading.Thread):
THREAD_ID = 0
MASTERS_DONE = 0
def __init__(self, action_queue, request_helper):
super(Worker, self).__init__()
self.action_queue = action_queue
self.request_helper = request_helper
Worker.THREAD_ID += 1
self.id = Worker.THREAD_ID
self.success = 0
self.setDaemon(True)
pass
def run(self):
print('>[{tid}] is live.'.format(tid=self.id))
request_data = self.request_helper.get_data()
action_data = self.action_queue.get_next()
while True:
if action_data is None:
sleep(5)
action_data = self.action_queue.get_next()
continue
url = action_data['url']
action = action_data['action']
# change ip after successful requests
if self.success == 10:
self.success = 0
request_data = self.request_helper.get_data()
try:
print('> [{id}] requests ({url}) with ({ip})'.format(id=self.id, url=url, ip=request_data['ip']))
r = requests.get(url=url, headers=request_data['user-agent'], cookies={}, proxies=request_data['proxy'],
timeout=7)
# success
if r.status_code == 200:
self.success += 1
soup = BeautifulSoup(r.text, 'lxml')
if action == TYPE_MASTER_LIST:
self._process_master_list(url, soup)
if action == TYPE_MASTER_PAGE:
self._process_master_page(url, soup)
print('> [{id}] finished - sleeping 3s.'.format(id=self.id))
sleep(3)
action_data = self.action_queue.get_next()
# too many requests
elif r.status_code == 429:
print('> [{id}] 429 fail - return action to queue - sleeping 5s.'.format(id=self.id))
sleep(5)
else:
print('> [{id}] Random ERROR: {error_code}'.format(id=self.id, error_code=r.status_code))
sleep(5)
except requests.exceptions.ConnectTimeout as e:
print('> [{id}] == ConnectTimeout == [{ex}] - return action to queue - sleeping 10s.'.format(id=self.id,
ex=str(e)))
request_data = self.request_helper.get_data()
sleep(10)
except Exception as e:
print('> [{id}] - random fail [{ex}].'.format(id=self.id, ex=str(e)))
sleep(10)
continue
def _process_master_list(self, url, soup):
print('> [{id}] - (1) processing {url}.'.format(id=self.id, url=url))
master_page_urls = [BASE_URL + url['href'] for url in soup.select('.cards > .card > h4 > a')]
for url in master_page_urls:
self.action_queue.put({'url': url, 'action': TYPE_MASTER_PAGE})
print('> [{id}] - added {cnt} master pages.'.format(id=self.id, cnt=str(len(master_page_urls))))
link = soup.select_one('.pagination_next')
if link is not None:
master_list_url = BASE_URL + link['href']
self.action_queue.put({'url': master_list_url, 'action': TYPE_MASTER_LIST})
print('> [{id}] - added 1 master pages list.'.format(id=self.id))
def _process_master_page(self, url, soup):
print('> [{id}] - (2) processing {url}.'.format(id=self.id, url=url))
Worker.MASTERS_DONE += 1
print(' >>>>>>>> ' + str(Worker.MASTERS_DONE))
Main
from worker import Worker
from actions import ActionQueue
from helper import RequestHelper
import time
BASE_URL = 'https://www.discogs.com'
TYPE_MASTER_LIST = 1
TYPE_MASTER_PAGE = 2
TYPE_RELEASE_PAGE = 3
def main():
actions = ActionQueue()
request_helper = RequestHelper()
actions.put({
'url': 'https://www.discogs.com/search/?limit=25&genre_exact=Hip+Hop&type=master&page=1&country_exact=Serbia',
'action': TYPE_MASTER_LIST
})
workers = []
for i in range(10):
workers.append(Worker(actions, request_helper))
for worker in workers:
worker.start()
while True:
continue
if __name__ == "__main__":
main()
The code runs correctly for a short period of time, after which it displays "Max retries exceeded with url" for each thread request, even after changing the proxy.
[10] == ConnectTimeout == [HTTPSConnectionPool(host='www.discogs.com',
port=443): Max retries exceeded with url:
/Mar%C4%8Delo-Filteri-Deca-I-Sunce/master/640300 (Caused by
ConnectTimeoutError(, 'Connection to www.discogs.com timed out. (connect
timeout=7)'))] - return action to queue - sleeping 10s.
Each thread suffers from the same exception.
The full execution log can be found here. I am using Python 3.6 under macOS.

Python Multiprocessing Pool - Sharing one variable per process?

I have been trying to find a simple example where I share one constant variable per process launched in my process pool. Most examples show you how to share variables across processes, which is not what I want.
import multiprocessing
import time
data = (
{"var":1, "shared": None}, {"var":2, "shared": None}, {"var":3, "shared": None}, {"var":4, "shared": None}
)
def mp_worker(input):
print input
# print " Processs %s\tWaiting %s seconds" % (inputs, the_time)
# time.sleep(int(the_time))
# print " Process %s\tDONE" % inputs
def mp_handler():
p = multiprocessing.Pool(2)
p.map(mp_worker, data)
if __name__ == '__main__':
mp_handler()
For example, if I run this code, I would like to have my "shared" component initialized once for each process.
I would like to do something like this (this doesn't work):
from multiprocessing import Pool, Process
class Worker(Process):
def __init__(self):
print 'Worker started'
# do some initialization here
super(Worker, self).__init__()
def compute(self, data):
print 'Computing things!'
return data * data
if __name__ == '__main__':
# This works fine
worker = Worker()
#print worker.compute(3)
# workers get initialized fine
pool = Pool(processes = 4,
initializer = Worker)
data = range(10)
# How to use my worker pool?
# result = pool.map(Worker.compute, data)
result = pool.map(Worker.compute, data)
Using shared c_types:
import multiprocessing
from multiprocessing import Process, Lock
from multiprocessing.sharedctypes import Value
from ctypes import Structure, c_double
class Point(Structure):
_fields_ = [('x', c_double), ('y', c_double)]
def modify(parmMap):
parmMap['point'].x = parmMap['var']
parmMap['point'].y = parmMap['var'] * 2
if __name__ == '__main__':
lock = Lock()
data = ( {'var' : 1, 'shared' : Value(Point, (0,0), lock=lock) },
{'var' : 2, 'shared' : Value(Point, (0,0), lock=lock) },
{'var' : 3, 'shared' : Value(Point, (0,0), lock=lock) },
{'var' : 4, 'shared' : Value(Point, (0,0), lock=lock) }
)
p = multiprocessing.Pool(2)
print p.map(modify, data)
print data
def init(args, num_gpu):
pid = int(str(multiprocessing.current_process()).split(" ")[0].split("-")[-1].split(",")[0]) - 1
gpu_id = pid % num_gpu
global testModule
testModule = TestModuleShared(args, gpu_id)
def worker(datum):
pid = int(str(multiprocessing.current_process()).split(" ")[0].split("-")[-1].split(",")[0]) - 1
params = datum["params"]
# print str(datum["fc"]) + " " + str(pid)
# print testModule.openpose
# Reset State
testModule.run()
p = multiprocessing.Pool(per_gpu_threads*num_gpu, initializer=init, initargs=(params["test_module_param"],num_gpu,))
It turns out you can just use the global keyword, together with a Pool initializer callback, to set the variable up once per worker process.
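Stripped of the GPU-specific pieces, a minimal self-contained sketch of that pattern (SharedThing and the worker body are placeholders, not part of the original code) looks like this: the initializer runs once in each worker process and stores the expensive object in a module-level global, which every task later executed in that same process reuses:
import multiprocessing

class SharedThing(object):
    """Placeholder for the expensive per-process object (e.g. a model)."""
    def __init__(self, args):
        self.args = args

def init(args):
    # runs once per worker process; the global lives only in that process
    global shared
    shared = SharedThing(args)

def worker(datum):
    # every task run in this process sees the same 'shared' instance
    return (datum, id(shared))

if __name__ == '__main__':
    pool = multiprocessing.Pool(4, initializer=init, initargs=('my-args',))
    print(pool.map(worker, range(8)))
    pool.close()
    pool.join()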

Python Multi-threading in a recordset

I have a database record set (approx. 1000 rows) and I am currently iterating through it, using an extra DB query for each record to pull in more data.
Doing that raises the overall processing time to maybe 100 seconds.
What I want to do is spread the work across 2-4 processes.
I am using Python 2.7 for AWS Lambda compatibility.
def handler(event, context):
try:
records = connection.get_users()
mandrill_client = open_mandrill_connection()
mandrill_messages = get_mandrill_messages()
mandrill_template = 'POINTS weekly-report-to-user'
start_time = time.time()
messages = build_messages(mandrill_messages, records)
print("OVERALL: %s seconds ---" % (time.time() - start_time))
send_mandrill_message(mandrill_client, mandrill_template, messages)
connection.close_database_connection()
return "Process Completed"
except Exception as e:
print(e)
The following is the function which I want to run in threads:
def build_messages(messages, records):
for record in records:
record = dict(record)
stream = get_user_stream(record)
data = compile_loyalty_stream(stream)
messages['to'].append({
'email': record['email'],
'type': 'to'
})
messages['merge_vars'].append({
'rcpt': record['email'],
'vars': [
{
'name': 'total_points',
'content': record['total_points']
},
{
'name': 'total_week',
'content': record['week_points']
},
{
'name': 'stream_greek',
'content': data['el']
},
{
'name': 'stream_english',
'content': data['en']
}
]
})
return messages
What I have tried is importing the multiprocessing library:
from multiprocessing.pool import ThreadPool
I created a pool inside the try block and mapped the function over it:
pool = ThreadPool(4)
messages = pool.map(build_messages_in, itertools.izip(itertools.repeat(mandrill_messages), records))
def build_messages_in(a_b):
build_msg(*a_b)
def build_msg(a, b):
return build_messages(a, b)
def get_user_stream(record):
response = []
i = 0
for mod, mod_id, act, p, act_created in izip(record['models'], record['model_ids'], record['actions'],
record['points'], record['action_creation']):
information = get_reference(mod, mod_id)
if information:
response.append({
'action': act,
'points': p,
'created': act_created,
'info': information
})
if (act == 'invite_friend') \
or (act == 'donate') \
or (act == 'bonus_500_general') \
or (act == 'bonus_1000_general') \
or (act == 'bonus_500_cancel') \
or (act == 'bonus_1000_cancel'):
response[i]['info']['date_ref'] = act_created
response[i]['info']['slug'] = 'attiki'
if (act == 'bonus_500_general') \
or (act == 'bonus_1000_general') \
or (act == 'bonus_500_cancel') \
or (act == 'bonus_1000_cancel'):
response[i]['info']['title'] = ''
i += 1
return response
Finally, I removed the for loop from the build_messages function.
What I get as a result is: 'NoneType' object is not iterable.
Is this the correct way of doing this?
Your code seems pretty in-depth, so you cannot be sure that multithreading will lead to any performance gains when applied at a high level. Therefore, it's worth digging down to the part that gives you the largest latency and considering how to approach that specific bottleneck. See here for a longer discussion of threading limitations.
If, for example as we discussed in the comments, you can pinpoint a single task that is taking a long time, then you could try to parallelize it using multiprocessing instead, to leverage more of your CPU power. Here is a generic example that is hopefully simple enough to understand; it mirrors your Postgres queries without going into your own code base, which I think would be an unfeasible amount of effort.
import multiprocessing as mp
import time
import random
import datetime as dt
MAILCHIMP_RESPONSE = [x for x in range(1000)]
def chunks(l, n):
n = max(1, n)
return [l[i:i + n] for i in range(0, len(l), n)]
def db_query():
''' Delayed response from database '''
time.sleep(0.01)
return random.random()
def do_queries(query_list):
''' The function that takes all your query ids and executes them
sequentially for each id '''
results = []
for item in query_list:
query = db_query()
# Your super-quick processing of the Postgres response
processing_result = query * 2
results.append([item, processing_result])
return results
def single_processing():
''' As you do now - equivalent to get_reference '''
result_of_process = do_queries(MAILCHIMP_RESPONSE)
return result_of_process
def multi_process(chunked_data, queue):
''' Same as single_processing, except we put our results in queue rather
than returning them '''
result_of_process = do_queries(chunked_data)
queue.put(result_of_process)
def multiprocess_handler():
''' Divide and conquer on our db requests. We split the mailchimp response
into a series of chunks and fire our queries simultaneously. Thus, each
concurrent process has a smaller number of queries to make '''
num_processes = 4 # depending on cores/resources
size_chunk = len(MAILCHIMP_RESPONSE) / num_processes
chunked_queries = chunks(MAILCHIMP_RESPONSE, size_chunk)
queue = mp.Queue() # This is going to combine all the results
processes = [mp.Process(target=multi_process,
args=(chunked_queries[x], queue)) for x in range(num_processes)]
for p in processes: p.start()
divide_and_conquor_result = []
for p in processes:
divide_and_conquor_result.extend(queue.get())
return divide_and_conquor_result
if __name__ == '__main__':
start_single = dt.datetime.now()
single_process = single_processing()
print "Single process took {}".format(dt.datetime.now() - start_single)
print "Number of records processed = {}".format(len(single_process))
start_multi = dt.datetime.now()
multi = multiprocess_handler()
print "Multi process took {}".format(dt.datetime.now() - start_multi)
print "Number of records processed = {}".format(len(multi))
