Issue with Multiprocessing script in terminal - python

When I try to run my multiprocessing script in the terminal, I keep getting this error message:
RuntimeError:
An attempt has been made to start a new process before the
current process has finished its bootstrapping phase.
This probably means that you are not using fork to start your
child processes and you have forgotten to use the proper idiom
in the main module:
if __name__ == '__main__':
freeze_support()
This is my script:
import time
from multiprocessing import Pool
from multiprocessing import freeze_support
import getpass
import jaydebeapi
import pandas as pd
import numpy as np
from multiprocessing import Process, freeze_support, set_start_method

def test(first_evnt, last_evnt):
    PML_loan_Query = "select b.id_lpa_alt_loan from udbadm.pml_lst_cmpltd_trans_mtch a join udbadm.lpa_altv_loan_idtn b on a.id_evnt = b.id_evnt where b.cd_lpa_alt_loan_idtn = 'HewlettPackardGeneratedTransaction' and a.id_evnt BETWEEN ? AND ?"
    PML.execute(PML_loan_Query, (first_evnt, last_evnt))
    loan_records = PML.fetchall()

    df = pd.DataFrame()
    for x in loan_records:
        # Populating the ODS table
        #borr_query = "SELECT nbr_aus, CAST(NULLIF(NULLIF(cd_idx, -9999), 0.000000) AS VARCHAR(100)) AS cd_idx, CAST(rate_curr_int AS INT) AS rate_curr_int, CAST(NULLIF(rate_gr_mrtg_mrgn,0) AS INT) AS rate_gr_mrtg_mrgn, CAST(rate_loln_max_cap AS INT) AS rate_loln_max_cap, CAST(NULLIF(rate_perdc_cap,0) AS INT) AS rate_perdc_cap FROM DB2MANT.I_LP_TRANS WHERE nbr_trans_aus BETWEEN ? AND ?"
        borr_query = 'SELECT nbr_aus, CAST(NULLIF(NULLIF(cd_idx, -9999), 0.000000) AS VARCHAR(10)) AS cd_idx, CAST(rate_curr_int AS VARCHAR(10)) AS rate_curr_int, CAST(NULLIF(rate_gr_mrtg_mrgn,0) AS VARCHAR(10)) AS rate_gr_mrtg_mrgn, CAST(rate_loln_max_cap AS VARCHAR(10)) AS rate_loln_max_cap, CAST(NULLIF(rate_perdc_cap,0) AS VARCHAR(10)) AS rate_perdc_cap FROM DB2MANT.I_LP_TRANS WHERE nbr_trans_aus IN (?)'
        #borr_query = "SELECT DISTINCT nbr_aus FROM DB2MANT.I_LP_TRANS WHERE nbr_trans_aus BETWEEN ? AND ?"
        ODS.execute(borr_query, x)
        #ODS.execute(ODS_list)
        ODS_records = ODS.fetchall()
        ODS_records = df.append(pd.DataFrame(ODS_records, columns=['nbr_aus', 'cd_idx', 'rate_curr_int', 'rate_gr_mrtg_mrgn', 'rate_loln_max_cap', 'rate_perdc_cap']))
    return ODS_records

if __name__ == '__main__':
    freeze_support()

    pw = getpass.getpass(prompt="Password", stream=False)

    # establishing the connection to the ODS database
    ODS = jaydebeapi.connect('com.ibm.db2.jcc.DB2Driver', 'jdbc:db2://he3qlxvtdbs351.fhlmc.com:50001/DB2QLTY', ['f408195', pw], 'C:/JDBC/db2jcc.jar')
    # allows SQL statements against the ODS database
    ODS = ODS.cursor()

    # creating the password needed to establish the PML database connection
    pw_2 = getpass.getpass(prompt="Password", stream=False)

    # establishing the connection to the PML database
    PML = jaydebeapi.connect('com.ibm.db2.jcc.DB2Driver', 'jdbc:db2://he3qlxvtdbs957.fhlmc.com:50001/PMLFDB2', ['f408195', pw_2], 'C:/JDBC/db2jcc.jar')
    # allows SQL statements against the PML database
    PML = PML.cursor()

    first_evnt = 155643917
    last_evnt = 155684481

    p = Pool()
    result = p.map(test, [first_evnt, last_evnt])
    print(result)
    p.close()
    p.join()
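As an illustration of the idiom the error message refers to (not part of the original post): with the spawn start method, which is the Windows default, child processes re-import the script, so everything that starts processes must sit under the __main__ guard, and resources a worker needs (such as database cursors) should be created inside the worker rather than inherited from the parent as globals. A minimal sketch, with a placeholder query_range function standing in for the jaydebeapi work and a purely illustrative split of the event range:

from multiprocessing import Pool, freeze_support

def query_range(bounds):
    # In the real script the jaydebeapi connections and cursors would be
    # opened here, inside the worker process, instead of living in __main__.
    first_evnt, last_evnt = bounds
    return (first_evnt, last_evnt)

if __name__ == '__main__':
    freeze_support()  # only needed when freezing the script into an executable
    ranges = [(155643917, 155664199), (155664200, 155684481)]  # illustrative split
    with Pool() as p:
        results = p.map(query_range, ranges)  # one argument tuple per task
    print(results)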

Related

How to create a high load trap receiver?

I need to process a very large number of traps (10,000 per second). I have a very basic Linux server. I tried to implement this with threads, but the CPU gets clogged up very quickly. Please tell me how to minimize the memory and CPU load while still processing a large number of traps.
There is also database work involved: the traps are written to a database.
from pysnmp.entity import engine, config
from pysnmp.carrier.asyncore.dgram import udp
from pysnmp.entity.rfc3413 import ntfrcv
import psycopg2
from pysnmp.hlapi import SnmpEngine as Sm, CommunityData, UdpTransportTarget,\
    ContextData, ObjectType, ObjectIdentity, getCmd
from datetime import datetime
import logging.config
from os import getpid, system, stat, path, chdir, listdir, remove
from threading import Thread

snmpEngine = engine.SnmpEngine()

config.addTransport(
    snmpEngine,
    udp.domainName + (1,),
    udp.UdpTransport().openServerMode(('localhost', 162))
)
config.addV1System(snmpEngine, '', 'public')

class cbFun(Thread):
    def __init__(self, snmpEngine, stateReference, contextEngineId, contextName,
                 varBinds, cbCtx):
        Thread.__init__(self)
        self.snmpEngine = snmpEngine
        self.stateReference = stateReference
        self.contextEngineId = contextEngineId
        self.contextName = contextName
        self.varBinds = varBinds
        self.cbCtx = cbCtx
        self.localConnected = False
        self.localDb = None
        self.errorFlag = False
        self.start()

    def run(self):
        print('\n{0}New trap message received on {1} {0}'.format(
            '-' * 7,
            datetime.now().strftime('%d-%b-%Y at %H:%M:%S')))
        execContext = self.snmpEngine.observer.getExecutionContext(
            'rfc3412.receiveMessage:request')
        print('Trap is coming from %s:%s' % execContext['transportAddress'])
        dict_traps = {}
        for name, val in self.varBinds:
            oid = name.prettyPrint()
            value = val.prettyPrint()
            print(f'{oid} = {value}')
            dict_traps.update({oid: value})
        self.connectDB(dict_traps)

    def connectDB(self, values):
        connect = psycopg2.connect(dbname="test", user="test",
                                   password="test",
                                   host="test")
        cursor = connect.cursor()
        for key, value in values.items():
            command = f"insert into TRAPS VALUES ({key}, {value})"
            cursor.execute(command)
        connect.commit()
        connect.close()

ntfrcv.NotificationReceiver(snmpEngine, cbFun)
snmpEngine.transportDispatcher.jobStarted(1)
try:
    snmpEngine.transportDispatcher.runDispatcher()
except:
    snmpEngine.transportDispatcher.closeDispatcher()
    raise
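A common way to cut the per-trap overhead (not from the original post) is to keep one long-lived database connection in a single writer thread and batch the inserts with executemany, instead of opening a connection and committing inside every callback thread. A rough sketch, assuming the same two-column TRAPS table and the placeholder credentials from the question:

import queue
import threading
import psycopg2

trap_queue = queue.Queue()

def db_writer(batch_size=500):
    # One long-lived connection, owned by a single writer thread.
    conn = psycopg2.connect(dbname="test", user="test", password="test", host="test")
    cur = conn.cursor()
    while True:
        rows = [trap_queue.get()]  # block until at least one row is available
        while len(rows) < batch_size:
            try:
                rows.append(trap_queue.get_nowait())
            except queue.Empty:
                break
        # Parameterized batch insert; one commit per batch instead of per trap.
        cur.executemany("INSERT INTO TRAPS VALUES (%s, %s)", rows)
        conn.commit()

threading.Thread(target=db_writer, daemon=True).start()

# In the trap callback, enqueue the (oid, value) pairs instead of writing directly:
# for oid, value in dict_traps.items():
#     trap_queue.put((oid, value))

This keeps the callback threads cheap and moves all database work onto one connection, which is usually where most of the CPU and memory pressure comes from.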

Python multiprocessing make multiple api calls

I'm trying to speed up my code with multiprocessing by making multiple API calls at once. Currently I make an API call, get the data I need from it, and then insert it into the database. It works, but it's very slow. I need to have about 700-800 million users in the database, and at the current speed it will take about 200-250 days. How can I make multiple API calls at once?
import traceback
import requests
import json
import sys
from time import time, sleep
from multiprocessing import Process, Queue
from io import BytesIO
import imagehash
from PIL import Image
import sqlite3
from multiprocessing import Process
from multiprocessing import Pool as ThreadPool

min = 7960265729
max = 9080098567
database_location = 'D:/Script/steam_database.db'
key = []
pool_size = 32
image_hashes = []

def queue_flusher(queue, flush_limit=80, temp=0):
    connection = sqlite3.connect(database_location)
    cursor = connection.cursor()
    cursor.execute("CREATE TABLE IF NOT EXISTS user (id INTEGER PRIMARY KEY AUTOINCREMENT, hash TEXT, profile TEXT)")
    connection.commit()

    while True:
        if(queue.qsize() < flush_limit):
            sleep(.1)
        else:
            temp += 80
            print(f"Flushing {flush_limit} out of queue {temp}")
            queue_input = [queue.get() for _ in range(0, flush_limit)]
            cursor = connection.cursor()
            for row in queue_input:
                if row['image'] not in image_hashes:
                    print(f"Inserting Row: {repr(row)}")
                    cursor.execute("INSERT INTO user (hash, profile) VALUES (?, ?);", (row['image'], row['profileUrl']))
                    image_hashes.append(row['image'])
            connection.commit()
    connection.close()

def databaseFiller(queue, min=0, max=0):
    while True:
        try:
            for i in range(min, max):
                r = requests.get(f'http://api.steampowered.com/ISteamUser/GetPlayerSummaries/v0002/?key={key[3]}&steamids=7656119{i}').json()
                pool = ThreadPool(8)
                all = pool.map(databaseFiller, i)
                response = r
                player = None
                steamid = None
                response = response.get('response', None)
                if response is None or not response.get('players', None):
                    continue
                player = response['players'][0]
                pfp = player.get('avatar', None)
                profileUrl = player.get('profileurl', None)
                if pfp != "https://steamcdn-a.akamaihd.net/steamcommunity/public/images/avatars/fe/fef49e7fa7e1997310d705b2a6158ff8dc1cdfeb.jpg":
                    img = requests.get(pfp)
                    img = Image.open(BytesIO(img.content))
                    image = str(imagehash.average_hash(img))
                    queue.put({'image': image, 'profileUrl': profileUrl})
        except Exception as e:
            # print(f'Received Response: {response}')
            print("Printing only the traceback above the current stack frame")
            print("".join(traceback.format_exception(sys.exc_info()[0], sys.exc_info()[1], sys.exc_info()[2])))
            print("Printing the full traceback as if we had not caught it here...")
            print(format_exception(e))

def format_exception(e):
    exception_list = traceback.format_stack()
    exception_list = exception_list[:-2]
    exception_list.extend(traceback.format_tb(sys.exc_info()[2]))
    exception_list.extend(traceback.format_exception_only(
        sys.exc_info()[0], sys.exc_info()[1]))
    exception_str = "Traceback (most recent call last):\n"
    exception_str += "".join(exception_list)
    exception_str = exception_str[:-1]
    return exception_str

if __name__ == '__main__':
    database_connection = sqlite3.connect("steam_database.db")
    data_queue = Queue()
    data_flush_process = Process(target=queue_flusher, args=([data_queue]))
    data_flush_process.start()

    total_nums = max - min
    nums_per_process = total_nums // pool_size
    for i in range(pool_size):
        new_min = min + (nums_per_process * i)
        new_max = max if i == (pool_size-1) else new_min + nums_per_process
        Process(target=databaseFiller, args=([data_queue, new_min, new_max])).start()
Thanks.
This will not solve your problem 100%, but I see you are inserting text into the SQLite file: you should collect the whole thing first, e.g. into a CSV or a list of rows, and then use cursor.executemany instead of cursor.execute. That kind of insertion is faster.
How long does it take to make 1 download?
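A minimal sketch of the batched insert suggested above; the table matches the user(hash, profile) schema from the question, and the sample rows are made up for illustration. Rows are accumulated in memory and written with a single executemany call per batch:

import sqlite3

def flush_batch(connection, rows):
    # rows is a list of (hash, profile) tuples collected from the queue.
    cursor = connection.cursor()
    cursor.executemany("INSERT INTO user (hash, profile) VALUES (?, ?);", rows)
    connection.commit()

connection = sqlite3.connect('D:/Script/steam_database.db')
connection.execute("CREATE TABLE IF NOT EXISTS user (id INTEGER PRIMARY KEY AUTOINCREMENT, hash TEXT, profile TEXT)")
batch = [('hash-1', 'https://steamcommunity.com/id/user1'),  # illustrative rows
         ('hash-2', 'https://steamcommunity.com/id/user2')]
flush_batch(connection, batch)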

Python Multiprocessing queue is not getting all the elements to process

For all the active campaigns, I have to query the TSDB API over a date period to fetch data for each campaign ID, so I get all the campaign IDs from the DB and put them into a queue. In the DB, I have 430 active campaign IDs.
But the Python code terminates after some 100 entries and I don't know the reason; can somebody guide me here? If I remove the API-fetching code and just print the value from the queue (q.get()), the ID values to query the API with do come through.
Below is the code:
import mysql.connector
from datetime import datetime, timedelta
from datetime import date
import requests
import json
from collections import OrderedDict
from multiprocessing import Pool, Queue
from os import getpid
from time import sleep
from random import random

db = mysql.connector.connect(
    host='HOSTNAME',
    database='DB',
    user='ROOT',
    password='PASSWORD',
    port='PORT'
)
print("Connection ID:", db.connection_id)

MAX_WORKERS = 10

class Testing_mp(object):
    def __init__(self):
        """
        Initiates a queue, a pool and a temporary buffer, used only
        when the queue is full.
        """
        self.q = Queue()
        self.pool = Pool(processes=MAX_WORKERS, initializer=self.worker_main,)
        self.temp_buffer = []

    def add_to_queue(self, msg):
        """
        If queue is full, put the message in a temporary buffer.
        If the queue is not full, adding the message to the queue.
        If the buffer is not empty and that the message queue is not full,
        putting back messages from the buffer to the queue.
        """
        if self.q.full():
            print("QISFULL", msg)
            self.temp_buffer.append(msg)
        else:
            self.q.put(msg)
            if len(self.temp_buffer) > 0:
                add_to_queue(self.temp_buffer.pop())

    def write_to_queue(self):
        """
        This function writes some messages to the queue.
        """
        mycursor = db.cursor()
        mycursor.execute("select Id from Campaign where Status='ACTIVE' order by Id desc")
        myresult = mycursor.fetchall()
        for x in myresult:
            self.add_to_queue(x[3])
            sleep(random()*2)
        db.close()  # close the connection

    def worker_main(self):
        """
        Waits indefinitely for an item to be written in the queue.
        Finishes when the parent process terminates.
        """
        print "Process {0} started".format(getpid())
        while True:
            # If queue is not empty, pop the next element and do the work.
            # If queue is empty, wait indefinitly until an element get in the queue.
            item = self.q.get(block=True, timeout=None)
            start_date = datetime.today()
            start_date = start_date.date()
            end_date = start_date - timedelta(days=8)
            start_date = start_date - timedelta(days=1)
            print "{0} retrieved: {1}".format(getpid(), item)
            #print("STARTDATE",type(start_date))
            start_date_ft = start_date.strftime('%Y/%m/%d')
            ##print("ENDDATE",end_date)
            end_date_ft = end_date.strftime('%Y/%m/%d')
            url = "http://tsdb.metrics.com:4343/api/query"
            if item is not None:
                querystring = {"start": end_date_ft, "end": start_date_ft, "m": "avg:1d-avg:percentization{campaign="+str(item)+",type=seen}"}
                print(querystring)
                response = requests.request("GET", url, params=querystring)
                print(response.text)
                if response and response.text is not None:
                    loaded_json = json.loads(response.text, object_pairs_hook=OrderedDict)
                    for x in loaded_json:
                        for attribute, value in x.items():
                            if attribute is not None and attribute == "dps":
                                dps_data = loaded_json[0][attribute]
                                perValue = []
                                if len(dps_data) > 0:
                                    for key, val in dps_data.items():
                                        perValue.append(str(val))
                                        print(str(item)+"==ITEM=="+key+"="+str(val))
                                    print(perValue)
            # simulate some random length operations
            sleep(random()*1)

# Warning from Python documentation:
# Functionality within this package requires that the __main__ module be
# importable by the children. This means that some examples, such as the
# multiprocessing.Pool examples will not work in the interactive interpreter.
if __name__ == '__main__':
    mp_class = Testing_mp()
    mp_class.write_to_queue()
    # Waits a bit for the child processes to do some work
    # because when the parent exits, childs are terminated.
    sleep(5)
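Not from the original post: a simpler pattern that avoids sharing a Queue with pool initializers is to fetch all the campaign IDs first and let Pool.map distribute them; the pool handles the hand-off and the parent blocks until every ID has been processed. A minimal sketch, with a placeholder fetch_campaign function standing in for the TSDB request and parsing done in worker_main:

from multiprocessing import Pool

def fetch_campaign(campaign_id):
    # Placeholder for the requests call to the TSDB API and the dps parsing.
    print("processing campaign", campaign_id)
    return campaign_id

if __name__ == '__main__':
    campaign_ids = [101, 102, 103]  # in practice: the 430 IDs fetched from MySQL
    with Pool(processes=10) as pool:
        results = pool.map(fetch_campaign, campaign_ids)
    print(len(results), "campaigns processed")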

Pickling issue while using Pool to count-check the files

I have code that spawns multiple processes to check the line counts of files and maintain the records in a database. The working code is below:
import multiprocessing as mp
from multiprocessing import Pool
import os
import time
import mysql.connector

"""Function to check the count of the file"""
def file_wc(fname):
    with open('/home/vaibhav/Desktop/Input_python/' + fname) as f:
        count = sum(1 for line in f)
    return (fname, count)

class file_audit:
    def __init__(self):
        """Initialising the constructor for getting the names of files
        and referencing the outside-class function"""
        folder = '/home/vaibhav/Desktop/Input_python'
        self.fnames = (name for name in os.listdir(folder))
        self.file_wc = file_wc

    def count_check(self):
        "Creating 4 worker processes to check the counts of the files in parallel"
        pool = Pool(4)
        self.m = list(pool.map(self.file_wc, list(self.fnames), 4))
        pool.close()
        pool.join()

    def database_updation(self):
        """To maintain an entry in the database with details
        like filename and records present in the file"""
        self.db = mysql.connector.connect(host="localhost", user="root", password="root", database="python_showtime")
        # prepare a cursor object using cursor() method
        self.cursor = self.db.cursor()
        query_string = ("INSERT INTO python_showtime.audit_capture"
                        "(name,records)"
                        "VALUES(%s,%s)")
        #data_user = (name,records)
        for each in self.m:
            self.cursor.execute(query_string, each)
        self.db.commit()
        self.cursor.close()

start_time = time.time()
print("My program took", time.time() - start_time, "to run")

#if __name__ == '__main__':
x = file_audit()
x.count_check()        # To check the counts by spawning multiple processes
x.database_updation()  # To maintain the entries in the database
Point to be considered
Now if I put my function inside the class and comment out self.file_wc = file_wc in the constructor, I get the error that generator objects can't be pickled. I have a fair understanding that some objects cannot be pickled, so I want to know, in very simple terms, what exactly is happening in the background. I got the reference from here or here to make the code work.
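For the background: pool.map has to pickle the callable and its arguments so they can be sent to the worker processes. A method defined on the class drags its whole instance along, and if that instance holds something unpicklable, such as the self.fnames generator, pickling fails. A tiny illustration (not from the original post):

import pickle

gen = (name for name in ['a.txt', 'b.txt'])
try:
    pickle.dumps(gen)
except TypeError as err:
    print("generators cannot be pickled:", err)

# A plain module-level function is pickled by reference and carries no
# instance state, which is why keeping file_wc outside the class (and only
# storing a reference to it on self) lets pool.map work.
def file_wc(fname):
    return (fname, 0)

print(len(pickle.dumps(file_wc)), "bytes")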

python cassandra driver same insert performance as copy

I'm trying to use Python async with Cassandra to see if I can write records to Cassandra faster than the CQL COPY command.
My python code looks like this:
from cassandra.cluster import Cluster
from cassandra import ConsistencyLevel
from cassandra.query import SimpleStatement

cluster = Cluster(['1.2.1.4'])
session = cluster.connect('test')

with open('dataImport.txt') as f:
    for line in f:
        query = SimpleStatement(
            "INSERT INTO tstTable (id, accts, info) VALUES (%s) " % (line),
            consistency_level=ConsistencyLevel.ONE)
        session.execute_async(query)
But it's giving me the same performance as the COPY command, around 2,700 rows/sec. Shouldn't it be faster with async?
Do I need to use multithreading in Python? I've just been reading about it, but I'm not sure how it fits in here.
EDIT:
So I found something online that I'm trying to modify, but I can't quite get it to work. This is what I have so far; I also split the file into 3 files in the /Data/toImport/ directory:
import multiprocessing
import time
import os
from cassandra.cluster import Cluster
from cassandra import ConsistencyLevel
from cassandra.query import SimpleStatement

cluster = Cluster(['1.2.1.4'])
session = cluster.connect('test')

def mp_worker(inputArg):
    with open(inputArg[0]) as f:
        for line in f:
            query = SimpleStatement(
                "INSERT INTO CustInfo (cust_id, accts, offers) values (%s)" % (line),
                consistency_level=ConsistencyLevel.ONE)
            session.execute_async(query)

def mp_handler(inputData, nThreads=8):
    p = multiprocessing.Pool(nThreads)
    p.map(mp_worker, inputData, chunksize=1)
    p.close()
    p.join()

if __name__ == '__main__':
    temp_in_data = file_list
    start = time.time()
    in_dir = '/Data/toImport/'
    N_Proc = 8
    file_data = [(in_dir) for i in temp_in_data]
    print '----------------------------------Start Working!!!!-----------------------------'
    print 'Number of Processes using: %d' %N_Proc
    mp_handler(file_data, N_Proc)
    end = time.time()
    time_elapsed = end - start
    print '----------------------------------All Done!!!!-----------------------------'
    print "Time elapsed: {} seconds".format(time_elapsed)
But I get this error:
Traceback (most recent call last):
  File "multiCass.py", line 27, in <module>
    temp_in_data = file_list
NameError: name 'file_list' is not defined
This post, A Multiprocessing Example for Improved Bulk Data Throughput, provides all the details needed to improve the performance of bulk data ingestion. Basically there are 3 mechanisms, and additional tuning can be done based on your use case and hardware:
single process (that's the case in your example)
multi-processing single queries
multi-processing concurrent queries
Size of batches and concurrency are the variables you'll have to play with yourself.
Got it working like this:
import multiprocessing
import time
import os
from cassandra.cluster import Cluster
from cassandra import ConsistencyLevel
from cassandra.query import SimpleStatement

def mp_worker(inputArg):
    cluster = Cluster(['1.2.1.4'])
    session = cluster.connect('poc')
    with open(inputArg[0]) as f:
        for line in f:
            query = SimpleStatement(
                "INSERT INTO testTable (cust_id, accts, offers) values (%s)" % (line),
                consistency_level=ConsistencyLevel.ONE)
            session.execute_async(query)

def mp_handler(inputData, nThreads=8):
    p = multiprocessing.Pool(nThreads)
    p.map(mp_worker, inputData, chunksize=1)
    p.close()
    p.join()

if __name__ == '__main__':
    temp_in_data = ['/toImport/part-00000', '/toImport/part-00001', '/toImport/part-00002']
    start = time.time()
    N_Proc = 3
    file_data = [(i,) for i in temp_in_data]
    print '----------------------------------Start Working!!!!-----------------------------'
    print 'Number of Processes using: %d' %N_Proc
    mp_handler(file_data, N_Proc)
    end = time.time()
    time_elapsed = end - start
    print '----------------------------------All Done!!!!-----------------------------'
    print "Time elapsed: {} seconds".format(time_elapsed)
