Multiprocessing.dummy, multiprocessing, and map, how to perform error handling? - python

I'm using the multiprocessing.dummy module to do some concurrent processing. I'm making HTTP requests, and there is a possibility that the object will not have any data returned. In this case I need to capture the AttributeError and move on.
I tried capturing it in the object itself, but still received the error; the only thing that worked was a try/except around the pool.map call itself. I'm wondering why that is, and whether this is the best way to do error handling for multiprocessing and map functions.
Here is some of my code for reference:
all_commits = []
projects = [Project(value['id']) for value in project_data.values()]
def process_projects(project):
if project.name in bad_names.keys():
project.name = bad_names[project.name]
project.return_results(rest, all_commits)
pool = ThreadPool(8)
pool.map(process_projects, projects)
pool.close()
pool.join()
print 'All data gathered.'
print 'Number of commits: {}'.format(len(all_commits))
fieldnames = get_fieldnames(
'ods_gerrit.staging_gerrit_commits',
settings.REDSHIFT_POSTGRES_INFO)
s3_file = ('staging_gerrit_commits_{}.csv.gz'.format(
date.today())
)
with gzip.open(s3_file, 'wb') as outf:
writer = DictWriter(
outf,
fieldnames=fieldnames,
extrasaction='ignore',
delimiter='|'
)
cnt = 0
pool = ThreadPool(8)
try:
pool.map(process_commits, all_commits)
except AttributeError:
pass
pool.close()
pool.join()
Then here is my Commit object code and the function that is being called by the map function:
class Commit(object):
def __init__(self, rev_id, change_id, full_id):
self.rev_id = rev_id
self.change_id = change_id
self.full_id = full_id
def clean_data(self, _dict):
for key, value in _dict.items():
if isinstance(value, dict):
self.clean_data(_dict[key])
else:
try:
_dict[key] = _dict[key].encode(
'utf_8',
'replace'
).encode('string_escape').replace('|', '[pipe]')
except AttributeError:
continue
def get_data(self, ger_obj):
print 'Getting data for a commit for {f_id}'.format(
f_id=self.full_id
)
endpoint = (r'/changes/{c_id}/revisions/{r_id}/commit'.format(
c_id=self.change_id,
r_id=self.rev_id
))
try:
self.data = ger_obj.get(endpoint)
except HTTPError:
try:
endpoint = (r'/changes/{f_id}/revisions/{r_id}/commit'.format(
f_id=self.full_id,
r_id=self.rev_id
))
self.data = ger_obj.get(endpoint)
except HTTPError:
logging.warning('Neither endpoint returned data: {ep}'.format(
ep=endpoint
))
raise HTTPError()
except ReadTimeout:
logging.warning('Read Timeout occurred for a commit. Endpoint: '
'{ep/}'.format(ep=endpoint))
return
self.data['change_id'] = self.change_id
self.data['proj_branch_id'] = self.full_id
self.data['revision_id'] = self.rev_id
self.data['commitid'] = self.data.get('commit')
self.data['name'] = self.data.get('committer')['name']
self.data['email'] = self.data.get('committer')['email']
self.data['date'] = self.data.get('committer')['date']
hash = md5()
hash.update(json.dumps(self.data).encode('utf-8'))
self.data['etl_checksum_md5'] = hash.hexdigest()
self.data['etl_process_status'] = settings.ETL_PROCESS_STATUS
self.data['etl_datetime_local'] = settings.ETL_DATETIME_LOCAL
self.data['etl_pdi_version'] = settings.ETL_PDI_VERSION
self.data['etl_pdi_build_version'] = settings.ETL_PDI_BUILD_VERSION
self.data['etl_pdi_hostname'] = settings.ETL_PDI_HOSTNAME
self.data['etl_pdi_ipaddress'] = settings.ETL_PDI_IPADDRESS
self.clean_data(self.data)
def write_data(self, writer):
print 'Writing a commit for {f_id}'.format(f_id=self.full_id)
writer.writerow(self.data)
And the controller function:
def process_commits(commit):
print 'On commit #{}'.format(cnt)
unique_id = commit.change_id + commit.rev_id
if not id_search(unique_ids, unique_id):
try:
commit.get_data(rest)
except HTTPError:
pass
try:
commit.write_data(writer=writer)
except UnicodeEncodeError:
logging.warning(
'{data} caused a Unicode Encode Error.'.format(
data=commit.data
))
pass
global cnt
cnt += 1
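For reference, pool.map gathers each worker's result and re-raises the first exception in the calling thread when the results are collected, which is why only a try/except around the map call caught it, and why one bad commit then aborts the whole batch. A sketch of an alternative (not from the original post; it reuses process_commits and all_commits from the code above) is to catch the error inside the function being mapped, so only that one item is skipped:
from multiprocessing.dummy import Pool as ThreadPool
import logging

def process_commits_safe(commit):
    # Catch per-item failures inside the worker so one bad commit
    # does not abort the entire pool.map call.
    try:
        process_commits(commit)
    except AttributeError:
        logging.warning('No data returned for {}'.format(commit.full_id))

pool = ThreadPool(8)
pool.map(process_commits_safe, all_commits)
pool.close()
pool.join()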

Related

Problem with POST JSON: I don't get the whole JSON after receiving

I have a web parser built on the "socket" library (S03) and a module that parses the resulting dict to extract the information I need (before_json).
Code for S03:
# Init .env
env_path = Path('.') / '.env'
load_dotenv(dotenv_path=env_path)
#init
secret_token = os.environ['secret_token']
#time
seconds = time.time()
local_time = time.ctime(seconds)
#HDRS
HDRS = 'HTTP/1.1 200 OK\r\nContent-Type: text/html; charset=utf-8\r\n\r\n'
HDRS_404 = 'HTTP/1.1 404 OK\r\nContent-Type: text/html; charset=utf-8\r\n\r\n'
# create webserver socket
socket_server = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
socket_server.bind(('ip', 8888))
socket_server.listen(356)
socket_server.settimeout(5)
#Start procesing incoming json
def start_my_server():
#def of compare secret key
def load_secret(secret_token, data):
try:
# slack_message_pipe(step=f'I LOAD JSON')
print('load_secret')
key = str(data)
key = re.findall(f'X-Gitlab-Token:...............', key)
print(key)
key = str(key).replace("['X-Gitlab-Token: ", '')
key = str(key).replace("']", '')
print(key)
print(secret_token, ' !!! ', key)
if secret_token == key:
socket_server.settimeout(None)
try_to_verification(key)
else:
fail_verifivcation()
except Exception as e:
print(e)
return
# slack_message_pipe(step=f'start_my_server.load_secret {e}')
def try_to_verification(key):
try:
print(key, 'key try_to_verification')
client_socket.send(HDRS.encode('utf-8'))
client_socket.shutdown(socket.SHUT_WR)
# with open(f"path to file('{local_time}').json", 'w+') as output_file:
# json.dump(data, output_file)
with open(f"path to file", 'w+') as file:
json.dump(data, file)
file.close()
print('next step')
json_dump_for_proj(data)
except Exception as e:
print(e)
return
# slack_message_pipe(step=f'start_my_server.try_to_verification {e}')
def fail_verifivcation():
try:
print('Not find')
client_socket.send(HDRS_404.encode('utf-8'))
client_socket.shutdown(socket.SHUT_WR)
addresses = open('ipPOST', 'a')
addresses.write(str(address) + f'{local_time}\n')
addresses.close()
except Exception as e:
print(e)
return
# slack_message_pipe(step=f'start_my_server.fail_verifivcation {e}')
while True:
print('start loop')
try:
print('try loop')
while True:
print('Working...')
client_socket, address = socket_server.accept()
print('loop', address)
data = client_socket.recv(1048576).decode('utf-8')
# slack_message_pipe(step=f'I GOT JSON')
load_secret(secret_token, data)
except Exception as e:
# slack_message_pipe(step=f'start_my_server.socket.error {e}')
print(f'pass try {e}')
fail_verifivcation()
Code for before_json:
home_path = os.environ['home']
# time
seconds = time.time()
local_time = time.ctime(seconds)
def json_dump_for_proj(data):
os.chdir(home_path)
try:
data = str(data).replace('null', '0')
# Find head json
data = re.sub('POST / HTTP/1.1\n.*\n.*\n.*\n.*\n.*\n.*\n.*\n.*\n', '', data)
# data = re.sub(',total_commits_count.*', '}', data)
data = re.sub('POST / HTTP/1.1\r\n.*\n.*\n.*\n.*\n.*\n.*\n.*\n.*\n', '', data)
# data = re.sub('.total_commits_count.*', '}', data)
data = re.sub('POST.*\r\n.*\n.*\n.*\n.*\n.*\n.*\n.*\n.*\n', '', data)
data = re.sub('"POST / HTTP/1.1\n.*\n.*\n.*\n.*\n.*\n.*\n.*\n.*\n', '', data)
data = re.sub('"POST / HTTP/1.1\r\n.*\n.*\n.*\n.*\n.*\n.*\n.*\n.*\n', '', data)
data = re.sub('"POST.*\r\n.*\n.*\n.*\n.*\n.*\n.*\n.*\n.*\n', '', data)
data = json.loads(data)
# parse needly info
# Branch
# print(data['ref'])
branch = data['ref']
# print(data['commits'])
for keys in data['commits']:
# id
id_hash = keys['id']
# author
# name
name = keys['author']['name']
# email
email = keys['author']['email']
# files
# added
added = keys['added']
for _ in range(len(added) + 1):
for j in added:
if 'path to file' not in j:
added.remove(j)
# print('path to file' not in added[-1])
# modif
modified = keys['modified']
for _ in range(len(modified) + 1):
for k in modified:
if '' not in k:
print(k)
modified.remove(k)
print(id_hash, name, email, branch, modified, sep='\n' + '*' * 100 + '\n')
list_of = [(name, email), added, modified, id_hash]
# write_list(list_of)
# print(not modified and not added)
message_dict = {"name": name, "email": email, "modified": modified, "added": added}
if not modified and not added:
slack_message_pipe_good(
step=f' \nI got commit by {message_dict.get("name")}\nEmail: {message_dict.get("email")}\n\nBut it is empty, pass')
return
try:
# slack_message_pipe_good(step=f' \nI got commit by {message_dict.get("name")}\nEmail: {message_dict.get("email")}\n\nInside this commit, the following changes\nadded:{message_dict.get("added")}\nmodified:{message_dict.get("modified")}\n\n I am going to the next step')
git_checkout(id_hash, message_dict)
except Exception as e:
slack_message_pipe(step=f'ERROS ON STEP before_deploy_Parse_Json.json_dump_for_proj: {e}')
return
except Exception as e:
with open(f'{local_time}.json',
'w+') as data_failure:
data_failure.write(data)
data_failure.close()
slack_message_pipe(step=f' before_deploy_Parse_Json.json_dump_for_proj {e}')
def write_list(list_of):
try:
with open(f'path to file', 'w+') as output_file:
output_file.write(str(list_of) + '\n')
output_file.close()
print('I all write')
except Exception as e:
slack_message_pipe(step=f' before_deploy_Parse_Json.write_list {e}')
# with open('data.json', 'r') as json_file:
# data = json.load(json_file)
# json_dump_for_proj(data)
if __name__ == '__main__':
print('Parse_Json')
The problem: I get the whole JSON from S03 and then begin parsing it in before_json, but it is not always whole; some part of the JSON is gone, and the size of the missing block varies.
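One possible cause (an assumption, not something stated in the post): the single client_socket.recv(1048576) call in S03 is not guaranteed to return the whole request, because TCP is a stream and recv returns only whatever has arrived so far, so large webhook payloads can be cut off. A rough sketch of reading the full HTTP request using the Content-Length header (simplified, no chunked encoding):
import re

def recv_http_request(client_socket):
    # Keep reading until the headers are complete, then read the rest
    # of the body based on Content-Length.
    buf = b''
    while b'\r\n\r\n' not in buf:
        chunk = client_socket.recv(65536)
        if not chunk:
            return buf.decode('utf-8')
        buf += chunk
    head, _, body = buf.partition(b'\r\n\r\n')
    match = re.search(rb'Content-Length:\s*(\d+)', head, re.IGNORECASE)
    length = int(match.group(1)) if match else 0
    while len(body) < length:
        chunk = client_socket.recv(65536)
        if not chunk:
            break
        body += chunk
    return (head + b'\r\n\r\n' + body).decode('utf-8')
data = recv_http_request(client_socket) would then replace the single recv call before load_secret(secret_token, data).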

argument for 's' must be a bytes object in Python 3.8

Why am I getting the error "argument for 's' must be a bytes object" when trying to run the lambda function? I'm following the usage example, but I'm getting this error. Any explanation of this issue and how to resolve it?
{
"errorMessage": "Failed sending data.\nERROR: argument for 's' must be a bytes object",
"errorType": "Exception",
"stackTrace": [
" File \"/var/task/AlertMetricSender.py\", line 5, in lambda_handler\n sender.send()\n",
" File \"/var/task/modules/ZabbixSender.py\", line 91, in send\n self.__active_checks()\n",
" File \"/var/task/modules/ZabbixSender.py\", line 79, in __active_checks\n response = self.__request(request)\n",
" File \"/var/task/modules/ZabbixSender.py\", line 59, in __request\n raise Exception(\"Failed sending data.\\nERROR: %s\" % e)\n"
]
}
ZabbixSender.py:
#
# For sending metric value to zabbix server.
#
# You must create item as "zabbix trapper" on server.
# Because the server must be connected to agent:10050, if it is selected "zabbix agent".
#
# Usage:
# from modules.ZabbixSender import ZabbixSender
# ZABBIX_HOST = "zabbix.example.com"
# ZABBIX_PORT = 10051
# sender = ZabbixSender(ZABBIX_HOST, ZABBIX_PORT)
# sender.add("example-hostname-01", "healthcheck", 1)
# sender.add("example-hostname-01", "item.keyname", 0.123)
# sender.add("example-hostname-02", "item.keyname", 1234)
# sender.send()
#
import socket
import struct
import time
import json
class ZabbixSender:
log = True
def __init__(self, host='127.0.0.1', port=10051):
self.address = (host, port)
self.data = []
def __log(self, log):
if self.log: print(log)
def __connect(self):
self.sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
try:
self.sock.connect(self.address)
except:
raise Exception("Can't connect server.")
def __close(self):
self.sock.close()
def __pack(self, request):
string = json.dumps(request)
header = struct.pack('<4sBQ', 'ZBXD', 1, len(string))
return header + string
def __unpack(self, response):
header, version, length = struct.unpack('<4sBQ', response[:13])
(data, ) = struct.unpack('<%ds'%length, response[13:13+length])
return json.loads(data)
def __request(self, request):
self.__connect()
try:
self.sock.sendall(self.__pack(request))
except Exception as e:
raise Exception("Failed sending data.\nERROR: %s" % e)
response = ''
while True:
data = self.sock.recv(4096)
if not data:
break
response += data
self.__close()
return self.__unpack(response)
def __active_checks(self):
hosts = set()
for d in self.data:
hosts.add(d['host'])
for h in hosts:
request = {"request":"active checks", "host":h}
self.__log("[active check] %s" % h)
response = self.__request(request)
if not response['response'] == 'success': self.__log("[host not found] %s" % h)
def add(self, host, key, value, clock=None):
if clock is None: clock = int(time.time())
self.data.append({"host":host, "key":key, "value":value, "clock":clock})
def send(self):
if not self.data:
self.__log("Not found sender data, end without sending.")
return False
self.__active_checks()
request = {"request":"sender data", "data":self.data}
response = self.__request(request)
result = True if response['response'] == 'success' else False
if result:
for d in self.data:
self.__log("[send data] %s" % d)
self.__log("[send result] %s" % response['info'])
else:
raise Exception("Failed send data.")
return result
if __name__ == '__main__':
sender = ZabbixSender()
sender.add("gedowfather-example-01", "healthcheck", 1)
sender.add("gedowfather-example-01", "gedow.item", 1111)
sender.send()
AlertMetricSender.py:
from modules.ZabbixSender import ZabbixSender
def lambda_handler(event, context):
sender = ZabbixSender("10.10.10.10", 10051)
sender.add("Zabbix server", "lambda.test", 5)
sender.send()
The error is coming from struct.pack. You're not seeing that, because of your blanket try/except.
All socket activity is done in byte strings, not Unicode strings. You need this:
def __pack(self, request):
string = json.dumps(request).encode('utf-8')
header = b'ZBXD' + struct.pack('<BQ', 1, len(string))
return header + string
One subtle thing about this: you must convert to a bytes string BEFORE you do len(string). It's quite possible for the UTF-8 encoding to contain more bytes than the original string has characters.
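A quick interpreter check of that point (not from the original answer):
s = "café"
len(s)                  # 4 characters
len(s.encode('utf-8'))  # 5 bytes; 'é' becomes two bytes in UTF-8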
AND I absolutely need to comment on this:
result = True if response['response'] == 'success' else False
What led you to write that? This is exactly the same as the much more natural reading:
result = response['response'] == 'success'

Value not appending to global array

I am trying to run a multithreaded email checker to see if the emails are office 365 valid.
Looking over and over my code, I cannot seem to find the reason it's not working correctly.
It should be appending the email to a GOOD or BAD list.
Instead, it's not appending anything!
This is my code:
...
currentDirectory = os.getcwd() # set the current directory - /new/
# Locations
location_emails_goods = currentDirectory + '/contacts/goods/'
location_emails_bads = currentDirectory + '/contacts/bads/'
location_emails = currentDirectory + '/contacts/contacts.txt'
now = datetime.now()
todayString = now.strftime('%d-%m-%Y-%H-%M-%S')
FILE_NAME_DATE_GOODS = None
FILE_NAME_DATE_BADS = None
ALL_EMAILS = get_contacts(location_emails)
url = 'https://login.microsoftonline.com/common/GetCredentialType'
# Get all emails
def get_contacts(filename):
emails = []
with open(filename, mode='r', encoding='utf-8') as contacts_file:
for a_contact in contacts_file:
emails.append(a_contact.strip())
return emails
def saveLogs():
global GOOD_EMAILS_ARRAY, BAD_EMAILS_ARRAY, file_bads, file_goods, FILE_NAME_DATE_GOODS, FILE_NAME_DATE_BADS
#print(GOOD_EMAILS_ARRAY)
for good in GOOD_EMAILS_ARRAY:
file_goods.write(good + '\n')
file_goods.close()
for bad in BAD_EMAILS_ARRAY:
file_bads.write(bad + '\n')
file_bads.close()
def newChecker(email):
global url, GOOD_EMAILS_ARRAY, BAD_EMAILS_ARRAY
s = req.session()
body = '{"Username":"%s"}' % email
request = req.post(url, data=body)
response = request.text
valid = re.search('"IfExistsResult":0,', response)
invalid = re.search('"IfExistsResult":1,', response)
if invalid:
BAD_EMAILS_ARRAY.append(email)
if valid:
GOOD_EMAILS_ARRAY.append(email)
else:
if valid:
GOOD_EMAILS_ARRAY.append(email)
else:
BAD_EMAILS_ARRAY.append(email)
# The following prints an empty array even though GOOD_EMAILS_ARRAY is defined globally, so it should be updating
print(GOOD_EMAILS_ARRAY)
def mp_handler(p):
global ALL_EMAILS
p.map(newChecker, ALL_EMAILS)
if __name__ == '__main__':
# Foreach email, parse it into our checker
# Define a filename to save to
FILE_NAME_DATE_GOODS = '{}{}{}'.format(location_emails_goods, todayString, '.txt')
FILE_NAME_DATE_BADS = '{}{}{}'.format(location_emails_bads, todayString, '.txt')
file_bads = open(FILE_NAME_DATE_BADS, 'a')
file_goods = open(FILE_NAME_DATE_GOODS, 'a')
p = multiprocessing.Pool(500)
mp_handler(p)
saveLogs()
p.close()
As you can see, I am trying to append each email to either GOOD_EMAILS_ARRAY or BAD_EMAILS_ARRAY.
BAD_EMAILS_ARRAY and GOOD_EMAILS_ARRAY are global variables, but for some reason nothing gets appended to them.
I am running this through multiprocessing, if that matters.
Any ideas, or errors you can spot in my code?
Processes do not share memory; a global variable with the same name in two processes refers to two different objects.
If you need to share state between processes, see this:
https://docs.python.org/3/library/multiprocessing.html#sharing-state-between-processes
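A minimal, self-contained sketch of the Manager approach from those docs (the '@' test is just a stand-in, not the poster's actual Office 365 check):
from multiprocessing import Manager, Pool

def check(args):
    # The Manager lists are proxies: every worker process appends to
    # the same underlying list held by the manager process.
    email, good_list, bad_list = args
    (good_list if '@' in email else bad_list).append(email)

if __name__ == '__main__':
    manager = Manager()
    good_list = manager.list()
    bad_list = manager.list()
    emails = ['a@example.com', 'not-an-email']
    with Pool(2) as p:
        p.map(check, [(e, good_list, bad_list) for e in emails])
    print(list(good_list), list(bad_list))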
Okay so it turns out that I just needed to use the Manager from multiprocessing:
from multiprocessing import Manager, Pool
Then I could use a normal list through the manager, such as:
# Set empty arrays using manager so we can carry it over
manager = Manager()
bad_list = manager.list()
good_list = manager.list()
This allowed me to use my script as it was, just with these new Manager-backed lists, which work just how I wanted :)
...
FILE_NAME_DATE_GOODS = None
FILE_NAME_DATE_BADS = None
# Set empty arrays using manager so we can carry it over
manager = Manager()
bad_list = manager.list()
good_list = manager.list()
# Get all emails
def get_contacts(filename):
emails = []
with open(filename, mode='r', encoding='utf-8') as contacts_file:
for a_contact in contacts_file:
emails.append(a_contact.strip())
return emails
ALL_EMAILS = get_contacts(location_emails)
url = 'https://login.microsoftonline.com/common/GetCredentialType'
def saveLogs():
global file_bads, file_goods, FILE_NAME_DATE_GOODS, FILE_NAME_DATE_BADS, good_list, bad_list
for good in good_list:
file_goods.write(good + '\n')
file_goods.close()
for bad in bad_list:
file_bads.write(bad + '\n')
file_bads.close()
print('{} => Fully completed email scanning'.format(Fore.CYAN))
print('{} => Good emails [{}] || Bad emails [{}]'.format(Fore.GREEN, FILE_NAME_DATE_GOODS, FILE_NAME_DATE_BADS))
def newChecker(email):
global url, good_list, bad_list
s = req.session()
body = '{"Username":"%s"}' % email
request = req.post(url, data=body)
response = request.text
valid = re.search('"IfExistsResult":0,', response)
invalid = re.search('"IfExistsResult":1,', response)
if invalid:
bad_list.append(email)
if valid:
good_list.append(email)
else:
if valid:
good_list.append(email)
else:
bad_list.append(email)
def mp_handler(p):
global ALL_EMAILS
p.map(newChecker, ALL_EMAILS)
if __name__ == '__main__':
# Foreach email, parse it into our checker
# Define a filename to save to
FILE_NAME_DATE_GOODS = '{}{}{}'.format(location_emails_goods, todayString, '.txt')
FILE_NAME_DATE_BADS = '{}{}{}'.format(location_emails_bads, todayString, '.txt')
file_bads = open(FILE_NAME_DATE_BADS, 'a')
file_goods = open(FILE_NAME_DATE_GOODS, 'a')
p = multiprocessing.Pool(500)
mp_handler(p)
saveLogs()
p.close()

Search haveibeenpwned for all emails on a domain

I am able to use haveibeenpwned to search for a single compromised account. However, I could not find an option to use the API key to search for compromises of all the email accounts on a domain. (For example, if the domain is xyz.com, I want to search for the compromise of abc@xyz.com, peter.charlie@xyz.com, and so on.) I am aware of the notification email that I can sign up for, but that is a lengthy process and I prefer using the API.
So I wrote a script to search haveibeenpwned for all the email addresses of my domain, but it takes very long. I searched through a couple of GitHub projects, but I did not find any such implementation. Has anyone tried this before?
I have added the code below. I am using a multithreading approach, but it still takes very long; is there any other optimization strategy I can use? Please help. Thank you.
import requests, json
import threading
from time import sleep
import datetime
import splunklib.client as client
import splunklib.results as results
date = datetime.datetime.now()
from itertools import islice
import linecache
import sys
def PrintException():
exc_type, exc_obj, tb = sys.exc_info()
f = tb.tb_frame
lineno = tb.tb_lineno
filename = f.f_code.co_filename
linecache.checkcache(filename)
line = linecache.getline(filename, lineno, f.f_globals)
print 'EXCEPTION IN ({}, LINE {} "{}"): {}'.format(filename, lineno, line.strip(), exc_obj)
class myThread (threading.Thread):
def __init__(self, threadID, name, list_emails):
threading.Thread.__init__(self)
self.threadID = threadID
self.name = name
self.list_emails = list_emails
def run(self):
i=0
print "Starting " + self.name
for email in self.list_emails:
print i
i=i+1
result = check_pasteaccount(email)
print email
print result
print result
print "Exiting " + self.name
def check_pasteaccount(account):
account = str(account)
result = ""
URL = "https://haveibeenpwned.com/api/v3/pasteaccount/%s?truncateResponse=false" % (account)
# print(URL)
headers= {'hibp-api-key':api_key}
result = ""
try:
r = requests.get(url=URL,headers=headers)
# sleep(2)
status_code = r.status_code
if status_code == 200:
data = r.text
result = []
for entry in json.loads(data.decode('utf8')):
if int((date - datetime.datetime.strptime(entry['Date'], '%Y-%m-%dT%H:%M:%SZ')).days) > 120:
pass
else:
result.append(['Title: {0}'.format(entry['Title']), \
'Source: {0}'.format(['Source']), \
'Paste ID: {0}'.format(entry['Id'])])
if len(result) == 0:
result = "No paste reported for given account and time frame."
else:
paste_result = ""
for entry in result:
for item in entry:
paste_result += str(item) + "\r\n"
paste_result += "\r\n"
result = paste_result
elif status_code == 404:
result = "No paste for the account"
else:
if status_code == 429:
sleep(5)
# print "Limit exceeded, sleeping"
result = check_pasteaccount(account)
else:
result = "Exception"
print status_code
except Exception as e:
result = "Exception"
PrintException()
pass
return result
def split_every(n, iterable):
iterable = iter(iterable)
for chunk in iter(lambda: list(islice(iterable, n)), []):
yield chunk
def main():
print datetime.datetime.now()
# Fetching the list of email addresses from Splunk
list_emails = connect_splunk()
print datetime.datetime.now()
i=0
list_split = split_every(1000,list_emails)
threads=[]
for list in list_split:
i=i+1
thread_name = "Thread" + str(i)
thread = myThread(1, thread_name, list)
thread.start()
threads.append(thread)
# Wait for all the threads to complete
for t in threads:
t.join()
print "Completed Search"
Here's a shorter and maybe more efficient version of your script using the standard multiprocessing library instead of a hand-rolled thread system.
You'll need Python 3.6+ since we're using f-strings.
You'll need to install the tqdm module for fancy progress bars.
You can adjust the number of concurrent requests with the pool size parameter.
Output is written in machine-readable JSON Lines format into a timestamped file.
A single requests session is shared (per-worker), which means less time spent connecting to HIBP.
import datetime
import json
import multiprocessing
import random
import time
import requests
import tqdm
HIBP_PARAMS = {
"truncateResponse": "false",
}
HIBP_HEADERS = {
"hibp-api-key": "xxx",
}
sess = requests.Session()
def check_pasteaccount(account):
while True:
resp = sess.get(
url=f"https://haveibeenpwned.com/api/v3/pasteaccount/{account}",
params=HIBP_PARAMS,
headers=HIBP_HEADERS,
)
if resp.status_code == 429:
print("Quota exceeded, waiting for a while")
time.sleep(random.uniform(3, 7))
continue
if resp.status_code >= 400:
return {
"account": account,
"status": resp.status_code,
"result": resp.text,
}
return {
"account": account,
"status": resp.status_code,
"result": resp.json(),
}
def connect_splunk():
# TODO: return emails
return []
def main():
list_emails = [str(account) for account in connect_splunk()]
datestamp = datetime.datetime.now().isoformat().replace(":", "-")
output_filename = f"accounts-log-{datestamp}.jsonl"
print(f"Accounts to look up: {len(list_emails)}")
print(f"Output filename: {output_filename}")
with multiprocessing.Pool(processes=16) as p:
with open(output_filename, "a") as f:
results_iterable = p.imap_unordered(
check_pasteaccount, list_emails, chunksize=20
)
for result in tqdm.tqdm(
results_iterable,
total=len(list_emails),
unit="acc",
unit_scale=True,
):
print(json.dumps(result, sort_keys=True), file=f)
if __name__ == "__main__":
main()

How to handle exception, so the script continues to work?

There is a script which makes API requests by iterating over a params dictionary.
If the params are not compatible with each other (metrics and dimensions) or there is a mistake, it throws an exception:
googleapiclient.errors.HttpError: "Could not parse content (N/A) of field parameters.filters.">
And the script stops working.
It looks like this
def yt_return_api_response(yt_params):
responses = []
timestamp = []
try:
youtubeAnalytics = get_service()
for k, v in yt_params.items():
request = execute_api_request(
youtubeAnalytics.reports().query,
ids=v['ids'],
startDate=v['startDate'],
endDate=v['endDate'],
metrics=v['metrics'],
dimensions=v['dimensions'],
filters=v['filters'],
maxResults=v['maxResults'],
sort=v['sort'])
response = youtube_response(request)
responses.append(response)
# get the timestamp
timestamp_request = dt.datetime.now()
timestamp_request = timestamp_request.strftime('%Y-%m-%d %H:%M:%S.%f')
timestamp.append(timestamp_request)
return responses, timestamp
except Exception as e:
logging.error('Check the request params, unsupported query', exc_info=True)
I've tried to change it so that if one iteration fails, the script does not crash but keeps working.
With 'while True' it starts and just keeps running without producing any result:
def yt_return_api_response(yt_params, request_ids, filters):
responses = []
timestamp = []
while True:
try:
With 'finally' it returns empty lists:
def yt_return_api_response(yt_params):
responses = []
timestamp = []
try:
youtubeAnalytics = get_service()
for k, v in yt_params.items():
request = execute_api_request(
youtubeAnalytics.reports().query,
ids=v['ids'],
startDate=v['startDate'],
endDate=v['endDate'],
metrics=v['metrics'],
dimensions=v['dimensions'],
filters=v['filters'],
maxResults=v['maxResults'],
sort=v['sort'])
response = youtube_response(request)
responses.append(response)
# get the timestamp
timestamp_request = dt.datetime.now()
timestamp_request = timestamp_request.strftime('%Y-%m-%d %H:%M:%S.%f')
timestamp.append(timestamp_request)
except Exception as e:
logging.error('Check the request params, unsupported query', exc_info=True)
finally:
return responses, timestamp
Is there another way to handle exceptions?
You need to skip just the failing iteration; in your code, when an exception is caught you drop out of the loop entirely. You can skip a single iteration like this:
def yt_return_api_response(yt_params):
responses = []
timestamp = []
youtubeAnalytics = get_service()
for k, v in yt_params.items():
try:
request = execute_api_request(
youtubeAnalytics.reports().query,
ids=v['ids'],
startDate=v['startDate'],
endDate=v['endDate'],
metrics=v['metrics'],
dimensions=v['dimensions'],
filters=v['filters'],
maxResults=v['maxResults'],
sort=v['sort'])
response = youtube_response(request)
responses.append(response)
# get the timestamp
timestamp_request = dt.datetime.now()
timestamp_request = timestamp_request.strftime('%Y-%m-%d %H:%M:%S.%f')
timestamp.append(timestamp_request)
except Exception as e:
logging.error('Check the request params, unsupported query', exc_info=True)
return responses, timestamp
