I'm trying to speed up my code with multiprocessing by making multiple API calls at once. Currently I make an API call, extract the data I need from the response, and then insert it into the database. It works, but it's very slow: I need about 700-800 million users in the database, and at the current speed it will take roughly 200-250 days. How can I make multiple API calls at once?
import traceback
import requests
import json
import sys
from time import time, sleep
from multiprocessing import Process, Queue
from io import BytesIO
import imagehash
from PIL import Image
import sqlite3
from multiprocessing import Pool as ThreadPool
min = 7960265729
max = 9080098567
database_location = 'D:/Script/steam_database.db'
key = []
pool_size = 32
image_hashes = []
def queue_flusher(queue, flush_limit=80, temp = 0):
connection = sqlite3.connect(database_location)
cursor = connection.cursor()
cursor.execute("CREATE TABLE IF NOT EXISTS user (id INTEGER PRIMARY KEY AUTOINCREMENT, hash TEXT, profile TEXT)")
connection.commit()
while True:
if(queue.qsize() < flush_limit):
sleep(.1)
else:
temp += flush_limit
print(f"Flushing {flush_limit} out of queue {temp}")
queue_input = [queue.get() for _ in range(0, flush_limit)]
cursor = connection.cursor()
for row in queue_input:
if row['image'] not in image_hashes:
print(f"Inserting Row: {repr(row)}")
cursor.execute("INSERT INTO user (hash, profile) VALUES (?, ?);", (row['image'], row['profileUrl']))
image_hashes.append(row['image'])
connection.commit()
connection.close()
def databaseFiller(queue, min = 0, max = 0):
while True:
try:
for i in range(min, max):
r = requests.get(f'http://api.steampowered.com/ISteamUser/GetPlayerSummaries/v0002/?key={key[3]}&steamids=7656119{i}').json()
response = r
player = None
steamid = None
response = response.get('response', None)
if response is None or not response.get('players', None):
continue
player = response['players'][0]
pfp = player.get('avatar', None)
profileUrl = player.get('profileurl', None)
if pfp != "https://steamcdn-a.akamaihd.net/steamcommunity/public/images/avatars/fe/fef49e7fa7e1997310d705b2a6158ff8dc1cdfeb.jpg":
img = requests.get(pfp)
img = Image.open(BytesIO(img.content))
image = str(imagehash.average_hash(img))
queue.put({'image': image, 'profileUrl': profileUrl})
except Exception as e:
# print(f'Received Response: {response}')
print("Printing only the traceback above the current stack frame")
print("".join(traceback.format_exception(sys.exc_info()[0], sys.exc_info()[1], sys.exc_info()[2])))
print("Printing the full traceback as if we had not caught it here...")
print(format_exception(e))
def format_exception(e):
exception_list = traceback.format_stack()
exception_list = exception_list[:-2]
exception_list.extend(traceback.format_tb(sys.exc_info()[2]))
exception_list.extend(traceback.format_exception_only(
sys.exc_info()[0], sys.exc_info()[1]))
exception_str = "Traceback (most recent call last):\n"
exception_str += "".join(exception_list)
exception_str = exception_str[:-1]
return exception_str
if __name__ == '__main__':
database_connection = sqlite3.connect("steam_database.db")
data_queue = Queue()
data_flush_process = Process(target=queue_flusher, args=([data_queue]))
data_flush_process.start()
total_nums = max - min
nums_per_process = total_nums // pool_size
for i in range(pool_size):
new_min = min + (nums_per_process * i)
new_max = max if i == (pool_size-1) else new_min + nums_per_process
Process(target=databaseFiller, args=([data_queue, new_min, new_max])).start()
thanks.
This will not solve your problem 100%, but I see you are inserting rows into the SQLite file one at a time. You should collect the data first (e.g. into a list or a CSV) and then use cursor.executemany instead of cursor.execute; that insertion is much faster.
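For example, here is a minimal sketch of what the flush step in queue_flusher could look like with a batched insert (this reuses the user table and queue rows from the code above; the rest of the structure is unchanged):
rows = [queue.get() for _ in range(flush_limit)]
# keep only hashes we have not inserted yet
new_rows = [(row['image'], row['profileUrl'])
            for row in rows
            if row['image'] not in image_hashes]
# one call inserts the whole batch instead of one execute per row
cursor.executemany("INSERT INTO user (hash, profile) VALUES (?, ?);", new_rows)
image_hashes.extend(img_hash for img_hash, _ in new_rows)
connection.commit()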
How long does it take to make 1 download?
I need to process a very large number of SNMP traps (10,000 per second). I have a very basic Linux server. I tried to implement this with threads, but the CPU gets saturated very quickly. Please tell me how to minimize the load on memory and CPU while still processing a large number of traps.
There is also database work involved: the received traps are written to the database.
from pysnmp.entity import engine, config
from pysnmp.carrier.asyncore.dgram import udp
from pysnmp.entity.rfc3413 import ntfrcv
import psycopg2
from pysnmp.hlapi import SnmpEngine as Sm, CommunityData, UdpTransportTarget,\
ContextData, ObjectType, ObjectIdentity, getCmd
from datetime import datetime
import logging.config
from os import getpid, system, stat, path, chdir, listdir, remove
from threading import Thread
snmpEngine = engine.SnmpEngine()
config.addTransport(
snmpEngine,
udp.domainName + (1,),
udp.UdpTransport().openServerMode(('localhost', 162))
)
config.addV1System(snmpEngine, '', 'public')
class cbFun(Thread):
def __init__(self, snmpEngine, stateReference, contextEngineId, contextName,
varBinds, cbCtx):
Thread.__init__(self)
self.snmpEngine = snmpEngine
self.stateReference = stateReference
self.contextEngineId = contextEngineId
self.contextName = contextName
self.varBinds = varBinds
self.cbCtx = cbCtx
self.localConnected = False
self.localDb = None
self.errorFlag = False
self.start()
def run(self):
print('\n{0}New trap message received on {1} {0}'.format(
'-' * 7,
datetime.now().strftime('%d-%b-%Y at %H:%M:%S')))
execContext = self.snmpEngine.observer.getExecutionContext(
'rfc3412.receiveMessage:request')
print('Trap is coming from %s:%s' % execContext['transportAddress'])
dict_traps = {}
for name, val in self.varBinds:
oid = name.prettyPrint()
value = val.prettyPrint()
print(f'{oid} = {value}')
dict_traps.update({oid: value})
self.connectDB(dict_traps)
def connectDB(self, values):
connect = psycopg2.connect(dbname="test", user="test",
password="test",
host="test")
cursor = connect.cursor()
for key, value in values.items():
cursor.execute("INSERT INTO TRAPS VALUES (%s, %s)", (key, value))
connect.commit()
connect.close()
ntfrcv.NotificationReceiver(snmpEngine, cbFun)
snmpEngine.transportDispatcher.jobStarted(1)
try:
snmpEngine.transportDispatcher.runDispatcher()
except:
snmpEngine.transportDispatcher.closeDispatcher()
raise
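One way to reduce the per-trap overhead is to avoid opening a database connection and starting a thread for every trap: push the decoded varBinds onto a queue and let a single writer thread flush them in batches with executemany. Below is a minimal sketch that reuses the TRAPS table and placeholder credentials from the code above; the batch size and overall structure are assumptions, not part of the original.
import queue
import threading
import psycopg2

trap_queue = queue.Queue()

def db_writer(batch_size=500):
    # one long-lived connection instead of one connection per trap
    connect = psycopg2.connect(dbname="test", user="test", password="test", host="test")
    cursor = connect.cursor()
    while True:
        batch = [trap_queue.get()]          # block until at least one trap arrives
        while len(batch) < batch_size:
            try:
                batch.append(trap_queue.get_nowait())
            except queue.Empty:
                break
        cursor.executemany("INSERT INTO TRAPS VALUES (%s, %s)", batch)
        connect.commit()

threading.Thread(target=db_writer, daemon=True).start()

# in the notification callback, instead of connecting to the database:
#   for name, val in varBinds:
#       trap_queue.put((name.prettyPrint(), val.prettyPrint()))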
I need some help setting the configuration for sasl.mechanism PLAIN (API) and GSSAPI (Kerberos) authentication.
We are using Confluent Kafka here. There are two scripts: a Python script and a bash script that calls the Python one. You can find the script below.
Thanks for the help in advance!
import json
import os
import string
import random
import socket
import uuid
import re
from datetime import datetime
import time
import hashlib
import math
import sys
from functools import cache
from confluent_kafka import Producer, KafkaError, KafkaException
topic_name = os.environ['TOPIC_NAME']
partition_count = int(os.environ['PARTITION_COUNT'])
message_key_template = json.loads(os.environ['KEY_TEMPLATE'])
message_value_template = json.loads(os.environ['VALUE_TEMPLATE'])
message_header_template = json.loads(os.environ['HEADER_TEMPLATE'])
bootstrap_servers = os.environ['BOOTSTRAP_SERVERS']
perf_counter_batch_size = int(os.environ.get('PERF_COUNTER_BATCH_SIZE', 100))
messages_per_aggregate = int(os.environ.get('MESSAGES_PER_AGGREGATE', 1))
max_message_count = int(os.environ.get('MAX_MESSAGE_COUNT', sys.maxsize))
def error_cb(err):
""" The error callback is used for generic client errors. These
errors are generally to be considered informational as the client will
automatically try to recover from all errors, and no extra action
is typically required by the application.
For this example however, we terminate the application if the client
is unable to connect to any broker (_ALL_BROKERS_DOWN) and on
authentication errors (_AUTHENTICATION). """
print("Client error: {}".format(err))
if err.code() == KafkaError._ALL_BROKERS_DOWN or \
err.code() == KafkaError._AUTHENTICATION:
# Any exception raised from this callback will be re-raised from the
# triggering flush() or poll() call.
raise KafkaException(err)
def acked(err, msg):
if err is not None:
print("Failed to send message: %s: %s" % (str(msg), str(err)))
producer_configs = {
'bootstrap.servers': bootstrap_servers,
'client.id': socket.gethostname(),
'error_cb': error_cb
}
# TODO: Need to support sasl.mechanism PLAIN (API) and GSSAPI (Kerberos) authentication.
# TODO: Need to support truststores for connecting to private DCs.
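# A sketch of the SASL-related librdkafka configuration properties that could be merged
# into producer_configs; the values below are placeholders, not taken from this script.
#
# SASL/PLAIN:
#   producer_configs.update({
#       'security.protocol': 'SASL_SSL',          # or 'SASL_PLAINTEXT'
#       'sasl.mechanism': 'PLAIN',
#       'sasl.username': '<api-key>',
#       'sasl.password': '<api-secret>',
#       'ssl.ca.location': '/path/to/ca.pem',     # truststore equivalent for private DCs
#   })
#
# SASL/GSSAPI (Kerberos):
#   producer_configs.update({
#       'security.protocol': 'SASL_SSL',
#       'sasl.mechanism': 'GSSAPI',
#       'sasl.kerberos.service.name': 'kafka',
#       'sasl.kerberos.principal': '<principal>',
#       'sasl.kerberos.keytab': '/path/to/client.keytab',
#   })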
producer = Producer(producer_configs)
# generates a random value if it is not cached in the template_values dictionary
def get_templated_value(term, template_values):
if not term in template_values:
template_values[term] = str(uuid.uuid4())
return template_values[term]
def fill_template_value(value, template_values):
str_value = str(value)
template_regex = '{{(.+?)}}'
templated_terms = re.findall(template_regex, str_value)
for term in templated_terms:
str_value = str_value.replace(f"{{{{{term}}}}}", get_templated_value(term, template_values))
return str_value
def fill_template(template, templated_terms):
# TODO: Need to address metadata field, as it's treated as a string instead of a nested object.
return {field: fill_template_value(value, templated_terms) for field, value in template.items()}
@cache
def get_partition(lock_id):
bits = 128
bucket_size = 2**bits / partition_count
partition = (int(hashlib.md5(lock_id.encode('utf-8')).hexdigest(), 16) / bucket_size)
return math.floor(partition)
sequence_number = int(time.time() * 1000)
sequence_number = 0
message_count = 0
producing = True
start_time = time.perf_counter()
aggregate_message_counter = 0
# cache for templated term values so that they match across the different templates
templated_values = {}
try:
while producing:
sequence_number += 1
aggregate_message_counter += 1
message_count += 1
if aggregate_message_counter % messages_per_aggregate == 0:
# reset templated values
templated_values = {}
else:
for term in list(templated_values):
if term not in ['aggregateId', 'tenantId']:
del(templated_values[term])
# Fill in templated field values
message_key = fill_template(message_key_template, templated_values)
message_value = fill_template(message_value_template, templated_values)
message_header = fill_template(message_header_template, templated_values)
ts = datetime.utcnow().isoformat()[:-3]+'Z'
message_header['timestamp'] = ts
message_header['sequence_number'] = str(sequence_number)
message_value['timestamp'] = ts
message_value['sequenceNumber'] = sequence_number
lock_id = message_header['lock_id']
partition = get_partition(lock_id) # partition by lock_id, since key could be random, but a given aggregate_id should ALWAYS resolve to the same partition, regardless of key.
# Send message
producer.produce(topic_name, partition=partition, key=json.dumps(message_key), value=json.dumps(message_value), headers=message_header, callback=acked)
if sequence_number % perf_counter_batch_size == 0:
producer.flush()
end_time = time.perf_counter()
total_duration = end_time - start_time
messages_per_second=(perf_counter_batch_size/total_duration)
print(f'{messages_per_second} messages/second')
# reset start time
start_time = time.perf_counter()
if message_count >= max_message_count:
break
except Exception as e:
print(f'ERROR: {e}')
sys.exit(1)
finally:
producer.flush()
Given a list of data to process and a 64-core CPU (plus 500 GB RAM).
The task is to sort strings and store the data in a result set of millions of records; that part runs just fine and takes a few seconds with multiprocessing.
But I also need to store the result somewhere: a txt or csv output, or a database. So far I haven't found a viable solution, because after the first part (process), the insert method either throws an error when I try it with MySQL pooling, or takes an insanely long time when writing the txt output.
What I've tried so far: simple txt output, printing to a txt file, and the csv, pandas and numpy libraries. Nothing seems to speed it up. Any help would be greatly appreciated!
My code right now:
import os
import re
import datetime
import time
import csv
import mysql.connector as connector
from mysql.connector.pooling import MySQLConnectionPool
import mysql
import numpy as np
from tqdm import tqdm
from time import sleep
import multiprocessing as mp
import sys
pool = MySQLConnectionPool( pool_name="sql_pool",
pool_size=32,
pool_reset_session=True,
host="localhost",
port="3306",
user="homestead",
password="secret",
database="homestead")
# # sql connection
db = mysql.connector.connect(
host="localhost",
port="3306",
user="homestead",
password="secret",
database="homestead"
)
sql_cursor = db.cursor()
delete_statement = "DELETE FROM statistics"
sql_cursor.execute(delete_statement)
db.commit()
sql_statement = "INSERT INTO statistics (name, cnt) VALUES (%s, %s)"
list = []
domains = mp.Manager().list()
unique_list = mp.Manager().list()
invalid_emails = mp.Manager().list()
result = mp.Manager().list()
regex_email = '^(\w|\.|\_|\-)+[#](\w|\_|\-|\.)+[.]\w{2,3}$'
# check email validity
def check(list, email):
if(re.search(regex_email, email)):
domains.append(email.lower().split('#')[1])
return True
else:
invalid_emails.append(email)
return False
#end of check email validity
# execution time converter
def convertTime(seconds):
seconds = seconds % (24 * 3600)
hour = seconds // 3600
seconds %= 3600
minutes = seconds // 60
seconds %= 60
if(hour == 0):
if(minutes == 0):
return "{0} sec".format(seconds)
else:
return "{0}min {1}sec".format(minutes, seconds)
else:
return "{0}hr {1}min {2}sec".format(hour, minutes, seconds)
# execution time converter end
#process
def process(list):
for item in tqdm(list):
if(check(list, item)):
item = item.lower().split('#')[1]
if item not in unique_list:
unique_list.append(item)
# end of process
def insert(list):
global sql_statement
# Add to db
con = pool.get_connection()
cur = con.cursor()
print("PID %d: using connection %s" % (os.getpid(), con))
#cur.executemany(sql_statement, sorted(map(set_result, list)))
for item in list:
cur.execute(sql_statement, (item, domains.count(item)))
con.commit()
cur.close()
con.close()
# def insert_into_database(list):
#sql_cursor.execute(sql_statement, (unique_list, 1), multi=True)
# sql_cursor.executemany(sql_statement, sorted(map(set_result, list)))
# db.commit()
# statistics
def statistics(list):
for item in tqdm(list):
if(domains.count(item) > 0):
result.append([domains.count(item), item])
# end of statistics
params = sys.argv
filename = ''
process_count = -1
for i, item in enumerate(params):
if(item.endswith('.txt')):
filename = item
if(item == '--top'):
process_count = int(params[i+1])
def set_result(item):
return item, domains.count(item)
# main
if(filename):
try:
start_time = time.time()
now = datetime.datetime.now()
dirname = "email_stats_{0}".format(now.strftime("%Y%m%d_%H%M%S"))
os.mkdir(dirname)
list = open(filename).read().split()
if(process_count == -1):
process_count = len(list)
if(process_count > 0):
list = list[:process_count]
#chunking list
n = int(len(list) / mp.cpu_count())
chunks = [list[i:i + n] for i in range(0, len(list), n)]
processes = []
print('Processing list on {0} cores...'.format(mp.cpu_count()))
for chunk in chunks:
p = mp.Process(target=process, args=[chunk])
p.start()
processes.append(p)
for p in processes:
p.join()
# insert(unique_list)
## step 2 - write sql
## Clearing out db before new data insert
con = pool.get_connection()
cur = con.cursor()
delete_statement = "DELETE FROM statistics"
cur.execute(delete_statement)
u_processes = []
#Maximum pool size for sql is 32, so maximum chunk number should be that too.
if(mp.cpu_count() < 32):
n2 = int(len(unique_list) / mp.cpu_count())
else:
n2 = int(len(unique_list) / 32)
u_chunks = [unique_list[i:i + n2] for i in range(0, len(unique_list), n2)]
for u_chunk in u_chunks:
p = mp.Process(target=insert, args=[u_chunk])
p.start()
u_processes.append(p)
for p in u_processes:
p.join()
for p in u_processes:
p.close()
# sql_cursor.executemany(sql_statement, sorted(map(set_result, unique_list)))
# db.commit()
# for item in tqdm(unique_list):
# sql_val = (item, domains.count(item))
# sql_cursor.execute(sql_statement, sql_val)
#
# db.commit()
## numpy.savetxt('saved.txt', sorted(map(set_result, unique_list)), fmt='%s')
# with(mp.Pool(mp.cpu_count(), initializer = db) as Pool:
# Pool.map_async(insert_into_database(),set(unique_list))
# Pool.close()
# Pool.join()
print('Creating statistics for {0} individual domains...'.format(len(unique_list)))
# unique_list = set(unique_list)
# with open("{0}/result.txt".format(dirname), "w+") as f:
# csv.writer(f).writerows(sorted(map(set_result, unique_list), reverse=True))
print('Writing final statistics...')
print('OK.')
f = open("{0}/stat.txt".format(dirname),"w+")
f.write("Number of processed emails: {0}\r\n".format(process_count))
f.write("Number of valid emails: {0}\r\n".format(len(list) - len(invalid_emails)))
f.write("Number of invalid emails: {0}\r\n".format(len(invalid_emails)))
f.write("Execution time: {0}".format(convertTime(int(time.time() - start_time))))
f.close()
except FileNotFoundError:
print('File not found, path or file broken.')
else:
print('Wrong file format, should be a txt file.')
# main
See my comments regarding some changes you might wish to make, one of which might improve performance. But I think the one area of performance that could really be improved is your use of managed lists. These are represented by proxies, and each operation on such a list is essentially a remote procedure call and therefore very slow. You cannot avoid this, given that you need multiple processes updating a common, shared list (or dict, if you take my suggestion). But in the main process you might be trying, for example, to construct a set from a shared list as follows:
Pool.map_async(insert_into_database(),set(unique_list))
(By the way, that should be Pool.map(insert_into_database, set(unique_list)); i.e. you have an extra set of parentheses, and you can then get rid of the calls to pool.close() and pool.join() if you wish.)
The problem is that you are iterating every element of unique_list through a proxy, which might be what is taking a very long time. I say "might" because I would think the use of managed lists would prevent the code as is, i.e. without outputting the results, from completing in "a few seconds" if we are talking about "millions" of records and thus millions of remote procedure calls. But this number could certainly be reduced if you could somehow get the underlying list as a native list.
First, you need to heed my comment about having declared a variable named list, which makes it impossible to create native lists or subclasses of list. Once you have renamed that variable to something more reasonable, we can create our own managed class MyList that exposes the underlying list on which it is built. Note that you can do the same thing with a MyDict class that subclasses dict. I have defined both classes for you. Here is a benchmark showing the difference between constructing a native list from a standard managed list versus creating one from a MyList:
import multiprocessing as mp
from multiprocessing.managers import BaseManager
import time
class MyManager(BaseManager):
pass
class MyList(list):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
def get_underlying_list(self):
return self
class MyDict(dict):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
def get_underlying_dict(self):
return self
# required for windows, which I am running on:
if __name__ == '__main__':
l = mp.Manager().list()
for i in range(100_000):
l.append(i)
t = time.time()
l2 = list(l)
print(time.time() - t, l2[0:5], l2[-5:])
MyManager.register('MyList', MyList)
MyManager.register('MyDict', MyDict)
my_manager = MyManager()
# must explicitly start the manager or use: with MyManager() as manager:
my_manager.start()
l = my_manager.MyList()
for i in range(100_000):
l.append(i)
t = time.time()
l2 = list(l.get_underlying_list())
print(time.time() - t, l2[0:5], l2[-5:])
Prints:
7.3949973583221436 [0, 1, 2, 3, 4] [99995, 99996, 99997, 99998, 99999]
0.007997751235961914 [0, 1, 2, 3, 4] [99995, 99996, 99997, 99998, 99999]
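Applied back to the question's code, the shared lists would then be created through the custom manager instead of mp.Manager(), and the main process would pull each one back as a native list in a single proxy call. This is only a sketch, assuming the MyManager/MyList definitions above and the variable names from the question:
MyManager.register('MyList', MyList)
my_manager = MyManager()
my_manager.start()

domains = my_manager.MyList()
unique_list = my_manager.MyList()
invalid_emails = my_manager.MyList()
result = my_manager.MyList()

# ... start and join the worker processes exactly as before ...

# one proxy call returns the whole underlying list, which is then copied locally
native_unique = list(unique_list.get_underlying_list())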
For all the active campaigns, I have to query the TSDB API over a date period to fetch data for each Campaign ID, so I get all the Campaign IDs from the DB and put them on a queue. In the DB, I have 430 active campaign IDs.
But the Python code terminates after some 100 entries and I don't know the reason. Can somebody guide me here? If I remove the API query code and just print the value taken off the queue (q.get()), the ID used to call the API does come through.
Below is the code:
import mysql.connector
from datetime import datetime,timedelta
from datetime import date
import requests
import json
from collections import OrderedDict
from multiprocessing import Pool, Queue
from os import getpid
from time import sleep
from random import random
db = mysql.connector.connect(
host='HOSTNAME',
database='DB',
user='ROOT',
password='PASSWORD',
port='PORT'
)
print("Connection ID:", db.connection_id)
MAX_WORKERS=10
class Testing_mp(object):
def __init__(self):
"""
Initiates a queue, a pool and a temporary buffer, used only
when the queue is full.
"""
self.q = Queue()
self.pool = Pool(processes=MAX_WORKERS, initializer=self.worker_main,)
self.temp_buffer = []
def add_to_queue(self, msg):
"""
If queue is full, put the message in a temporary buffer.
If the queue is not full, adding the message to the queue.
If the buffer is not empty and that the message queue is not full,
putting back messages from the buffer to the queue.
"""
if self.q.full():
print("QISFULL",msg)
self.temp_buffer.append(msg)
else:
self.q.put(msg)
if len(self.temp_buffer) > 0:
self.add_to_queue(self.temp_buffer.pop())
def write_to_queue(self):
"""
This function writes some messages to the queue.
"""
mycursor = db.cursor()
mycursor.execute("select Id from Campaign where Status='ACTIVE' order by Id desc")
myresult = mycursor.fetchall()
for x in myresult:
self.add_to_queue(x[0])
sleep(random()*2)
db.close() # close the connection
def worker_main(self):
"""
Waits indefinitely for an item to be written in the queue.
Finishes when the parent process terminates.
"""
print "Process {0} started".format(getpid())
while True:
# If queue is not empty, pop the next element and do the work.
# If queue is empty, wait indefinitly until an element get in the queue.
item = self.q.get(block=True,timeout=None)
start_date=datetime.today()
start_date=start_date.date()
end_date = start_date - timedelta(days=8)
start_date = start_date - timedelta(days=1)
print "{0} retrieved: {1}".format(getpid(), item)
#print("STARTDATE",type(start_date))
start_date_ft=start_date.strftime('%Y/%m/%d')
##print("ENDDATE",end_date)
end_date_ft=end_date.strftime('%Y/%m/%d')
url = "http://tsdb.metrics.com:4343/api/query"
if item is not None:
querystring = {"start":end_date_ft,"end":start_date_ft,"m":"avg:1d-avg:percentization{campaign="+str(item)+",type=seen}"}
print(querystring)
response = requests.request("GET", url,params=querystring)
print(response.text)
if response and response.text is not None:
loaded_json = json.loads(response.text,object_pairs_hook=OrderedDict)
for x in loaded_json:
for attribute, value in x.items():
if attribute is not None and attribute=="dps":
dps_data=loaded_json[0][attribute]
perValue=[]
if len(dps_data)>0:
for key,val in dps_data.items():
perValue.append(str(val))
print(str(item)+"==ITEM=="+key+"="+str(val))
print(perValue)
# simulate some random length operations
sleep(random()*1)
# Warning from Python documentation:
# Functionality within this package requires that the __main__ module be
# importable by the children. This means that some examples, such as the
# multiprocessing.Pool examples will not work in the interactive interpreter.
if __name__ == '__main__':
mp_class = Testing_mp()
mp_class.write_to_queue()
# Waits a bit for the child processes to do some work
# because when the parent exits, childs are terminated.
sleep(5)
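The comment above already points at the likely cause: the child processes are terminated as soon as the parent exits, and a fixed sleep(5) is not enough time to work through 430 IDs. Below is a minimal sketch of one way to wait for the workers instead, using plain Process workers and a sentinel value; the worker body and names here are placeholders, not the original class:
from multiprocessing import Process, Queue

NUM_WORKERS = 10
SENTINEL = None

def worker(q):
    while True:
        item = q.get()
        if item is SENTINEL:
            break                          # stop marker reached, exit cleanly
        # ... query the TSDB API for this campaign id here ...
        print("processed", item)

if __name__ == '__main__':
    q = Queue()
    workers = [Process(target=worker, args=(q,)) for _ in range(NUM_WORKERS)]
    for w in workers:
        w.start()
    for campaign_id in range(430):         # stand-in for the IDs fetched from MySQL
        q.put(campaign_id)
    for _ in workers:
        q.put(SENTINEL)                    # one stop marker per worker
    for w in workers:
        w.join()                           # wait for all work instead of sleep(5)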
I'm getting an operand error from the MySQL module, which is looking for 4 columns. I just don't see which operands it wants. It works for some case types and not others. Here is the traceback (trimmed at the bottom, where it refers to the MySQL library):
Traceback (most recent call last):
File "/Users/christoph/PycharmProjects/physicianWorkQueueProject/physicianWorkQueueProject.py", line 158, in <module>
parse()
File "/Users/christoph/PycharmProjects/physicianWorkQueueProject/physicianWorkQueueProject.py", line 138, in parse
c.execute(getPhysiciansql_cmd)
...
mysql.connector.errors.DataError: 1241 (21000): Operand should contain 4 column(s)
#!/usr/bin/python
##################################################
# This is a prototype pathologist work queue management system
# The program calls out to a database of pathologists, their specialities,
# and an index of case types and their intended specialists
#
# Usage: Run at your command line. You will then enter case numbers (which aren't validated (currently))
# and case types (which are validated). The program will distribute the entered case as follows:
#
# If the case is intended for generalists, a system that amounts to names being pulled from a hat
# is employed. When a name is selected (at random), the case is entered into that pathologists' work queue.
# That pathologists' name is not returned to the pool.
# The cycle will then repeat with a random name picked every time, in this same way, for generalist-requiring cases,
# until the entirety of names have been pulled. At this point, all names are returned to the pool and the whole cycle
# begins again
#
# For the specialist requiring cases, the name-draw system is bypassed and the case is directly entered
# into the pathologists' work queue.
# 16 Aug 2018
# My Real Name
##################################################
import mysql.connector as mariaDB
import time
import pandas as pd
from random import choice
def distributefairly(inputCaseNumber, inputcasetype):
# function distributefairly does a draw out of a hat, with each name being pulled and the pot shrinking until none are
# left at which point all names are added back
c.execute("SELECT physicianName FROM physicianNamesSpecialties;")
originalPhysicianList = c.fetchall()
physicianList = originalPhysicianList
# print("counter at:",count)
chosenPhysician = choice(physicianList)
pos = physicianList.index(chosenPhysician)
physicianList.pop(pos)
cleanedUpChosenPhysician = chosenPhysician[0]
insert(cleanedUpChosenPhysician)
print("This case is going to", cleanedUpChosenPhysician + ".")
select()
increment()
if len(physicianList) == 0:
reset()
def reset():
# this resets the counter the distributefairly module calls
global count
global physicianList
count = 0
c.execute("SELECT physicianName FROM physicianNamesSpecialties;")
physicianList = c.fetchall()
def increment():
# increments the counter of the distributefairly module
global count
count += 1
def print_count():
print(count)
def insert(cleanedUpPhysResult):
# adds inputted cases into the workQueue table
global inputCaseNumber
global inputCaseType
ts = time.gmtime()
readableTs = time.strftime("%Y-%m-%d %H:%M:%S", ts)
c2.execute("INSERT INTO workQueue (name, caseNumber, timestamp, tableCaseType) values (%s,%s,%s,%s)", (cleanedUpPhysResult, inputCaseNumber, readableTs, str(inputCaseType)))
conn2.commit()
def select():
# this outputs the workQueue after every addition
sql = "SELECT * FROM workQueue"
c2.execute(sql)
rWQ = c2.fetchall()
print(pd.DataFrame(rWQ, columns= ['Name','Case Number','Time Stamp','Specialty','Row ID'])) # .set_index('Row ID')
def startup():
# create()
global inputCaseType
global inputCaseNumber
inputCaseNumber = input("Enter Case Number: ")
inputCaseType = input("Enter case type (use proper abbreviations): ")
def parse():
global inputCaseNumber
global inputCaseType
caseInputsql_cmd = "SELECT specialtyRequiredToProcess,Description FROM caseTypes WHERE caseTypeName='{}'".format(inputCaseType)
c.execute(caseInputsql_cmd)
rows_returned = c.fetchall()
if not rows_returned:
print("No match to table of specimen types returned. Check the case type abbreviation and try again.")
return
else:
for row in rows_returned:
r = row[0]
d = row[1]
print("This is a", r, "service case. It is a", d,"type case.")
if r != "GENERALIST":
getPhysiciansql_cmd = "SELECT physicianName FROM physicianNamesSpecialties WHERE (specialty, specialty2, specialty3, specialty4) ='{}'".format(r)
c.execute(getPhysiciansql_cmd)
physResult = choice(c.fetchall())
cleanedUpPhysResult = physResult[0]
print("This case is going to", cleanedUpPhysResult+".")
insert(cleanedUpPhysResult)
select()
else:
distributefairly(inputCaseNumber, inputCaseType)
conn = mariaDB.connect(host='xxxxx', user='xxxxx',password='xxxxxx',db='lookupDB')
conn2 = mariaDB.connect(host='xxxxxxl', user='xxxxx',password='xxxxxxx',db='workQueue')
c = conn.cursor()
c2 = conn2.cursor()
count = 0
while True:
startup()
parse()
This line of code .execute(...) will fail:
getPhysiciansql_cmd =
"SELECT physicianName FROM physicianNamesSpecialties WHERE (specialty, specialty2, specialty3, specialty4) ='{}'".format(r)
c.execute(getPhysiciansql_cmd)
My Var r holds this data:
print("This is a", r, "service case. It is a", d,"type case.")
This is a THORACIC service case. It is a TRANSBRONCHIAL WANG NEEDLE ASPIRATION type case.
My Var getPhysiciansql_cmd holds this data:
getPhysiciansql_cmd = SELECT physicianName FROM
physicianNamesSpecialties WHERE (specialty, specialty2, specialty3, specialty4) ='THORACIC'
I think this is just a matter of how I'm using the WHERE clause. I saw somewhere that when a WHERE clause with multiple fields is written, you put parentheses around the fields. I've experimented with reducing this to the bare minimum and using the OR operator in the WHERE clause, and I have successfully received a proper response.
import mysql.connector as mariaDB
conn = mariaDB.connect(host='xxxxx', user='xxxx',password='xxxxx',db='lookupDB')
conn2 = mariaDB.connect(host='xxxxx', user='xxxx',password='xxxxxx',db='workQueue')
c = conn.cursor()
c2 = conn2.cursor()
inputSpecialty = input("specialty? ")
r = inputSpecialty
c.execute("SELECT physicianName FROM physicianNamesSpecialties WHERE specialty = %s OR specialty2 = %s OR specialty3 = %s OR specialty4 = %s", (r,r,r,r))
physResult = c.fetchall()
cleanedUpPhysResult = physResult
print(cleanedUpPhysResult)
Output:
specialty? THORACIC
[('Song',), ('Han',), ('He',), ('Goldfischer',)]
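For completeness, the same parameterized OR query dropped back into parse() would look roughly like this (a sketch; only the query construction and execute call change, the surrounding variables are from the original program):
getPhysiciansql_cmd = ("SELECT physicianName FROM physicianNamesSpecialties "
                       "WHERE specialty = %s OR specialty2 = %s "
                       "OR specialty3 = %s OR specialty4 = %s")
c.execute(getPhysiciansql_cmd, (r, r, r, r))
physResult = choice(c.fetchall())
cleanedUpPhysResult = physResult[0]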