Search haveibeenpwned for all emails on a domain - python

I am able to use haveibeenpwned to search for the compromise of a single account. However, I could not find an option to use the API key to search for compromises of all the email accounts on a domain. (For example, if the domain is xyz.com, I want to search for the compromise of abc@xyz.com, peter.charlie@xyz.com and so on.) I am aware of the notification email that I can sign up for, but that is a lengthy process and I prefer using the API.
So, I wrote a script to search against haveibeenpwned for all the email address of my domain, but it takes very long. I searched through a couple of Github projects, but I did not find any such implementation. Has anyone tried this before?
I have added the code below. I am using Multi threading approach, but still it takes very long, is there any other Optimization strategy I can use? Please help. Thank you.
import requests, json
import threading
from time import sleep
import datetime
import splunklib.client as client
import splunklib.results as results
date = datetime.datetime.now()
from itertools import islice
import linecache
import sys
def PrintException():
    """Print where the exception currently being handled was raised.

    Reports the file name, line number, source line and exception value.
    Meant to be called from inside an ``except`` block; when no exception is
    active nothing is printed.
    """
    exc_type, exc_obj, tb = sys.exc_info()
    if tb is None:
        # No exception is being handled, so there is no frame to report.
        return
    frame = tb.tb_frame
    lineno = tb.tb_lineno
    filename = frame.f_code.co_filename
    # Drop stale cache entries in case the file changed on disk.
    linecache.checkcache(filename)
    line = linecache.getline(filename, lineno, frame.f_globals)
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print('EXCEPTION IN ({}, LINE {} "{}"): {}'.format(
        filename, lineno, line.strip(), exc_obj))
class myThread(threading.Thread):
    """Worker thread that checks one batch of e-mail addresses for pastes."""

    def __init__(self, threadID, name, list_emails):
        """Store the thread id, display name and the batch of addresses."""
        threading.Thread.__init__(self)
        self.threadID = threadID
        self.name = name
        self.list_emails = list_emails

    def run(self):
        """Look up every address in the batch and print each result."""
        print("Starting " + self.name)
        for index, email in enumerate(self.list_emails):
            print(index)
            result = check_pasteaccount(email)
            print(email)
            print(result)
        print("Exiting " + self.name)
def check_pasteaccount(account):
    """Look up recent pastes for one account on haveibeenpwned.

    Args:
        account: The e-mail address to look up; converted with ``str()``.

    Returns:
        A string: one "Title / Source / Paste ID" paragraph per paste that is
        at most 120 days old, a fixed "no paste" message when there is
        nothing to report, or ``"Exception"`` when the lookup failed.
    """
    account = str(account)
    url = "https://haveibeenpwned.com/api/v3/pasteaccount/%s?truncateResponse=false" % (account)
    headers = {'hibp-api-key': api_key}
    try:
        # Retry in a loop while rate limited; recursing here could exhaust
        # the stack during a long throttling period.
        while True:
            r = requests.get(url=url, headers=headers)
            if r.status_code != 429:
                break
            sleep(5)
        status_code = r.status_code
        if status_code == 404:
            return "No paste for the account"
        if status_code != 200:
            print(status_code)
            return "Exception"

        recent = []
        for entry in r.json():
            posted = entry['Date']
            # The paste API may report a null date; such pastes cannot be
            # age-filtered, so they are kept rather than failing the lookup.
            if posted is not None:
                age = date - datetime.datetime.strptime(posted, '%Y-%m-%dT%H:%M:%SZ')
                if age.days > 120:
                    continue
            recent.append(['Title: {0}'.format(entry['Title']),
                           'Source: {0}'.format(entry['Source']),
                           'Paste ID: {0}'.format(entry['Id'])])
        if not recent:
            return "No paste reported for given account and time frame."
        # One line per field, with a blank line after each paste.
        return "".join(
            "".join(str(item) + "\r\n" for item in entry) + "\r\n"
            for entry in recent)
    except Exception:
        # Best effort: report the failure and let the caller carry on with
        # the next account.
        PrintException()
        return "Exception"
def split_every(n, iterable):
    """Yield consecutive lists of up to ``n`` items taken from ``iterable``.

    The last list may be shorter; nothing is yielded for an empty iterable.
    """
    source = iter(iterable)
    while True:
        batch = list(islice(source, n))
        if batch == []:
            return
        yield batch
def main():
    """Fetch the addresses from Splunk and check them in batches of 1000.

    One ``myThread`` is started per batch; the function returns once every
    thread has finished.
    """
    print(datetime.datetime.now())
    # Fetching the list of email addresses from Splunk
    list_emails = connect_splunk()
    print(datetime.datetime.now())
    threads = []
    for index, chunk in enumerate(split_every(1000, list_emails), 1):
        # Give every thread its own id instead of a constant 1.
        thread = myThread(index, "Thread" + str(index), chunk)
        thread.start()
        threads.append(thread)
    # Wait for all the threads to complete
    for t in threads:
        t.join()
    print("Completed Search")

Here's a shorter and maybe more efficient version of your script using the standard multiprocessing library instead of a hand-rolled thread system.
You'll need Python 3.6+ since we're using f-strings.
You'll need to install the tqdm module for fancy progress bars.
You can adjust the number of concurrent requests with the pool size parameter.
Output is written in machine-readable JSON Lines format into a timestamped file.
A single requests session is shared (per-worker), which means less time spent connecting to HIBP.
import datetime
import json
import multiprocessing
import random
import time
import requests
import tqdm
# Query-string parameters sent with every HIBP request.
HIBP_PARAMS = {
"truncateResponse": "false",
}
# Request headers; "xxx" is a placeholder for the real API key.
HIBP_HEADERS = {
"hibp-api-key": "xxx",
}
# Module-level session used by check_pasteaccount for its requests.
sess = requests.Session()
def check_pasteaccount(account):
    """Query the HIBP paste endpoint for one account.

    Rate-limited requests (HTTP 429) are retried until they succeed.

    Args:
        account: The e-mail address to look up.

    Returns:
        A dict with the keys ``account``, ``status`` and ``result``.
        ``status`` is the HTTP status code, or ``None`` when the request
        itself failed. ``result`` is the decoded JSON body on success and
        the raw text or error message otherwise.
    """
    while True:
        try:
            resp = sess.get(
                url=f"https://haveibeenpwned.com/api/v3/pasteaccount/{account}",
                params=HIBP_PARAMS,
                headers=HIBP_HEADERS,
                # Without a timeout a stalled connection blocks the worker
                # forever.
                timeout=30,
            )
        except requests.RequestException as exc:
            # Report the failure as a record so one bad request does not
            # abort the whole run.
            return {
                "account": account,
                "status": None,
                "result": str(exc),
            }
        if resp.status_code == 429:
            print("Quota exceeded, waiting for a while")
            # Prefer the server's own hint; fall back to a random pause.
            retry_after = resp.headers.get("retry-after", "")
            if retry_after.isdigit():
                delay = int(retry_after) + random.uniform(0, 1)
            else:
                delay = random.uniform(3, 7)
            time.sleep(delay)
            continue
        if resp.status_code >= 400:
            return {
                "account": account,
                "status": resp.status_code,
                "result": resp.text,
            }
        try:
            result = resp.json()
        except ValueError:
            # Body was not valid JSON; keep the raw text instead of crashing.
            result = resp.text
        return {
            "account": account,
            "status": resp.status_code,
            "result": result,
        }
def connect_splunk():
    """Return the accounts to look up (placeholder: currently always empty)."""
    # TODO: return emails
    emails = []
    return emails
def main():
    """Look up every account in a 16-process pool and log results as JSON Lines.

    Results are appended, one JSON object per line, to a file whose name
    contains the current timestamp.
    """
    accounts = [str(account) for account in connect_splunk()]
    stamp = datetime.datetime.now().isoformat().replace(":", "-")
    output_filename = f"accounts-log-{stamp}.jsonl"
    print(f"Accounts to look up: {len(accounts)}")
    print(f"Output filename: {output_filename}")
    with multiprocessing.Pool(processes=16) as pool, open(output_filename, "a") as log:
        # Results arrive in completion order, not input order.
        lookups = pool.imap_unordered(check_pasteaccount, accounts, chunksize=20)
        progress = tqdm.tqdm(
            lookups,
            total=len(accounts),
            unit="acc",
            unit_scale=True,
        )
        for record in progress:
            log.write(json.dumps(record, sort_keys=True) + "\n")


if __name__ == "__main__":
    main()

Related

Best way to make thousands of get requests in python

Right now I am working on a Python script which takes a list of URLs as an argument, performs a GET request on each URL, and then searches through the output with XPath to fingerprint the website. It seems to work like a charm when the list is around 50 sites long, but anything beyond that causes the program to slow down to the point where it stops (usually around 150 sites). Scroll down to where you see the main app logic; the relevant code is below. Right now I am just using 50 elements in the array and it works fine, but anything more makes the entire program stop. Any suggestions would be greatly appreciated!
#!/usr/bin/python
# Web Scraper
# 1.0
# Imports for file
from multiprocessing.dummy import Pool as ThreadPool
from threading import Thread
from Queue import Queue
from lxml import html
import requests
import time
import sys
# Get Raw HTML
def scrape(url):
    """Fetch ``url`` and return its parsed HTML tree.

    Returns:
        The tree built by ``html.fromstring`` when the response status is
        OK, otherwise ``False`` (non-OK status, network error or any other
        failure while fetching or parsing).
    """
    try:
        page = requests.get(url, timeout=2.0)
        if page.status_code != requests.codes.ok:
            return False
        return html.fromstring(page.content)
    except Exception:
        # Best effort: any fetch or parse failure means "no tree".
        # Catching Exception, not everything, keeps Ctrl-C working.
        return False
# Format URL
def format_url(url):
    """Normalize a site address for fetching.

    Prepends ``http://`` when the address has no http/https scheme and
    strips a single trailing slash.

    Args:
        url: Address as read from the input list.

    Returns:
        The normalized URL string.
    """
    # Only a leading scheme counts; "http://" inside a query string does not.
    if not url.lower().startswith(("http://", "https://")):
        url = "http://" + url
    if url.endswith("/"):
        url = url[:-1]
    return url
# Check if WordPress Site
def check_wordpress(tree):
    """Return True when the page loads a script from a ``wp-content`` path.

    Args:
        tree: Parsed HTML tree exposing an ``xpath`` method.
    """
    # Attributes are selected with "@" in XPath.
    scripts = tree.xpath("//script[contains(@src,'wp-content')]")
    return len(scripts) > 0
# Check WordPress Version
def wordpress_version(tree):
    """Return the WordPress version advertised by the generator meta tag.

    Args:
        tree: Parsed HTML tree exposing an ``xpath`` method.

    Returns:
        The second word of the first generator tag's content when its first
        word is "WordPress" (e.g. "4.1" for "WordPress 4.1"), otherwise 0.
    """
    generators = tree.xpath("//meta[@name='generator']/@content")
    if not generators:
        return 0
    details = generators[0].split()
    if len(details) > 1 and details[0] == "WordPress":
        return details[1]
    return 0
# Find Contact Page
def find_contact_page(tree, url=None):
    """Print the first link on the page whose text contains "Contact".

    Looks for the text in an ``<a>`` first, then in a ``<span>`` or ``<p>``
    and takes the parent's href. Prints nothing when no link is found.

    Args:
        tree: Parsed HTML tree exposing an ``xpath`` method.
        url: Optional site URL used to turn a root-relative link ("/...")
            into an absolute one. When omitted the link is printed as found.
    """
    queries = (
        "//a[contains(text(),'Contact')]/@href",
        "//span[contains(text(),'Contact')]/../@href",
        "//p[contains(text(),'Contact')]/../@href",
    )
    contact = []
    for query in queries:
        contact = tree.xpath(query)
        if len(contact) > 0:
            break
    if len(contact) == 0:
        return
    contact = contact[0]
    # NOTE(review): '#' may have been '@' originally (skipping mailto links);
    # confirm which character was intended.
    if contact.find('#') == -1 and contact.startswith('/') and url:
        contact = url + contact
    print(contact)
# Juicer method
def juice(url):
    """Fingerprint one site and return a one-line, tab-separated report.

    The report is the normalized URL followed by "No XML tree",
    "WordPress: <version>" or "Not WordPress".
    """
    site = format_url(url)
    tree = scrape(site)
    if tree is False:
        return site + " \t\t\t No XML tree"
    if check_wordpress(tree):
        return site + " \t\t\t WordPress: " + str(wordpress_version(tree))
    return site + " \t\t\t Not WordPress"
# Main App Logic Below ------------------------------------->
# Open list of websites from given argument
with open(sys.argv[1], 'r') as url_file:
    # Skip blank lines, e.g. the one produced by a trailing newline.
    urls = [line.strip() for line in url_file.read().split('\n') if line.strip()]


# Juice url
def juice_url():
    """Worker loop: take URLs off the queue and print their reports forever."""
    while True:
        url = q.get()
        try:
            result = juice(url)
        except Exception as exc:
            # A worker that dies never calls task_done(), which would make
            # q.join() block forever, so report the error and keep going.
            result = url + " \t\t\t Error: " + str(exc)
        finally:
            q.task_done()
        print(result)


# Create concurrent queues
concurrent = 50
q = Queue(concurrent)
for i in range(concurrent):
    t = Thread(target=juice_url)
    t.daemon = True
    t.start()
# Add URL to Queue
time1 = time.time()
batch = urls[0:50]
for url in batch:
    q.put(url)
q.join()
# Calculate total time
total = time.time() - time1
print("Total Time: %f" % total)
# Average over the URLs actually processed; avoid dividing by zero.
print("Average Time: %f" % (total / max(len(batch), 1)))

Passing variables in python to another web platform

I have a code which requires to pass the latency, upspeed, dlspeed to another web site to display. Right now the code is as below
import datetime
import os
import sys
import shutil
import webbrowser
import tempfile
import subprocess
import json
import urllib.request
import statistics
import pymysql
import pymysql.cursors
IPERF3_WIN_PATH = "data/iperf3.exe"
HTML_TEMPLATE_PATH = "data/template.html"
IPERF3_HOST = "127.0.0.1"
RESULT_UPLOAD_URL = "UPLOAD URL"
RESULT_VIEW_URL = "VIEW URL"
def resource_path(relative_path):
    """Return the absolute path of a bundled resource.

    Works both in development and in a PyInstaller build: PyInstaller
    unpacks into a temp folder and stores its path in ``sys._MEIPASS``;
    when that attribute is absent the current directory is used.
    """
    base_path = getattr(sys, "_MEIPASS", os.path.abspath("."))
    return os.path.join(base_path, relative_path)
def ping(ip, tries):
    """Ping ``ip`` with the Windows ``ping`` command.

    Args:
        ip: Host to ping.
        tries: Number of echo requests (passed to ``-n``).

    Returns:
        The average round-trip time in milliseconds as an int.

    Exits the program, after waiting for Enter, when the command fails or
    its output cannot be parsed.
    """
    try:
        output = subprocess.check_output(
            ["ping", "-n", str(tries), ip]).decode("utf-8")
        # The average is the number after the last " = " in the output.
        return int(output.split(" = ")[-1].split("ms")[0])
    except (subprocess.CalledProcessError, ValueError):
        # ValueError covers output without a parsable average (for example
        # when every packet is lost) and undecodable output.
        input("Press Enter to Continue...")
        sys.exit("Error while trying to ping the server, exiting")
def copyIperf3Exec():
    """Return the path of the bundled iperf3 executable.

    Only the Windows binary is handled here: the packaged path of
    ``IPERF3_WIN_PATH`` is returned as is, nothing is copied.
    """
    iperf3_path = resource_path(IPERF3_WIN_PATH)
    return iperf3_path
def _run_iperf3(extra_args, failure_message):
    """Run iperf3 against IPERF3_HOST with JSON output and return the parsed dict.

    Exits the program with ``failure_message``, after waiting for Enter,
    when iperf3 returns a non-zero exit status.
    """
    command = [copyIperf3Exec(), "-c", IPERF3_HOST, "-J"] + extra_args
    try:
        output = subprocess.check_output(command)
    except subprocess.CalledProcessError:
        input("Press Enter to Continue...")
        sys.exit(failure_message)
    return json.loads(output.decode("utf-8"))


def get_iperf3_download():
    """ Return the output of the iperf3 download test as a python dict """
    # -R reverses the direction so the server sends to this client.
    return _run_iperf3(
        ["-P", "16", "-w", "710000", "-R"],
        "Problem while doing the test, please try again later")


def get_iperf3_upload():
    """ Return the output of the iperf3 upload test as a python dict """
    return _run_iperf3(
        ["-P", "10", "-w", "710000"],
        "Error while doing the upload test, please try again later")
def get_userinfos():
    """Run the latency, upload and download tests and collect the results.

    Returns:
        A dict with the external IP, the average latency, and for upload and
        download the bits per second, test duration and start timestamp.
    """
    show_start_msg(0)  # 0% Progress bar
    latency = ping(IPERF3_HOST, 5)
    upload = get_iperf3_upload()
    show_start_msg(1)  # 40%
    download = get_iperf3_download()
    show_start_msg(2)  # 80%
    external_ip = getip_apify()
    up_summary = upload['end']['sum_received']
    down_summary = download['end']['sum_received']
    return {
        'ip': external_ip,
        'latency': latency,
        'upspeed': up_summary['bits_per_second'],
        'dlspeed': down_summary['bits_per_second'],
        'upspeedtime': up_summary['seconds'],
        'dlspeedtime': down_summary['seconds'],
        'upspeeddate': upload["start"]["timestamp"]["timesecs"],
        'dlspeeddate': download["start"]["timestamp"]["timesecs"],
    }
def sendToDB(infos):
    """Insert one test result into the ``speedlog`` table.

    Args:
        infos: Dict as returned by ``get_userinfos``.
    """
    def stp_date(stp):
        # Convert an epoch timestamp into a 'YYYY-MM-DD HH:MM:SS' string.
        return datetime.datetime.fromtimestamp(stp).strftime(
            '%Y-%m-%d %H:%M:%S')

    insert_sql = ("INSERT INTO `speedlog`"
                  "(`externalIP`, `uploadspeed`, `uploadspeedtime`,"
                  "`uploadspeeddate`, `downloadspeed`, `downloadspeedtime`,"
                  "`downloadspeeddate`, `latency`)"
                  "VALUES (%s, %s, %s, %s, %s, %s, %s, %s)")

    # Connect to the database
    connection = pymysql.connect(host='127.0.0.1',
                                 user='testclient',
                                 password='password',
                                 db='speed',
                                 charset='utf8mb4',
                                 cursorclass=pymysql.cursors.DictCursor)
    try:
        with connection.cursor() as cursor:
            # Create a new record
            row = (infos["ip"],
                   str(int(infos["upspeed"])),
                   str("{0:.2f}".format(infos["upspeedtime"])),
                   stp_date(infos["upspeeddate"]),
                   str(int(infos["dlspeed"])),
                   str("{0:.2f}".format(infos["dlspeedtime"])),
                   stp_date(infos["dlspeeddate"]),
                   str(int(infos["latency"])))
            cursor.execute(insert_sql, row)
        # The connection is not autocommit by default, so the insert must
        # be committed explicitly.
        connection.commit()
    finally:
        connection.close()
def getip_apify():
    """Return this machine's external IP address as reported by api.ipify.org."""
    # The with-block closes the HTTP response even if reading fails.
    with urllib.request.urlopen("http://api.ipify.org") as res:
        raw_ip = res.read()
    return raw_ip.decode('utf-8')
def prepare_template(templatePath, infos):
    """Fill the HTML template at ``templatePath`` with the test results.

    The placeholder ``avglatency`` is replaced by the latency as an integer;
    ``upspeed`` and ``dlspeed`` are replaced by the speeds divided by 10**9
    and formatted with three decimals.

    Args:
        templatePath: Path of the HTML template file.
        infos: Dict with the keys 'latency', 'upspeed' and 'dlspeed'.

    Returns:
        The filled-in template as a string.
    """
    with open(templatePath) as f_template:
        s_template = f_template.read()
    giga = 1000 * 1000 * 1000
    mod_template = s_template.replace("avglatency", str(int(infos['latency'])))
    mod_template = mod_template.replace(
        "upspeed", "{0:.3f}".format(infos['upspeed'] / giga))
    mod_template = mod_template.replace(
        "dlspeed", "{0:.3f}".format(infos['dlspeed'] / giga))
    return mod_template
def str_to_tempHtml(str):
    """Write ``str`` to a temporary ``.html`` file and return the file's path.

    The file is not deleted automatically; the caller owns it.
    """
    # The parameter shadows the builtin, so encode through the value itself.
    data = str.encode("utf-8")
    # Closing the file makes sure the data is on disk and the handle is
    # released before anything else opens the path.
    with tempfile.NamedTemporaryFile(suffix=".html", delete=False) as tmp:
        tmp.write(data)
    return tmp.name
def show_start_msg(progress):
    """Clear the screen and print the banner plus a progress bar.

    Args:
        progress: -1 shows the banner and waits for Enter; 0, 1, 2 and 3
            draw the 0%, 40%, 80% and 100% bars. Any other value prints the
            banner and status lines without a bar.
    """
    platform = sys.platform
    if platform.startswith('darwin'):
        os.system('clear')
    elif platform.startswith('win32'):
        os.system('cls')

    rule = "=" * 70
    print(rule)
    print("Speed Testing for 10G Network \n")
    print("Powered by iPerf3")
    print(rule)

    if progress == -1:
        input("Press Enter to Continue...\n")
        return

    print("Press Enter to Continue...\n")
    print("Testing in progress")
    bars = {
        0: "[" + " " * 68 + "]" + " 0%",
        1: "[" + "#" * 27 + " " * 41 + "]" + " 40%",
        2: "[" + "#" * 54 + " " * 14 + "]" + " 80%",
        3: "[" + "#" * 68 + "]" + " 100%",
    }
    if progress in bars:
        print(bars[progress])
    if progress == 3:
        print("Completed")
if __name__ == '__main__':
    show_start_msg(-1)
    infos = get_userinfos()
    sendToDB(infos)
    show_start_msg(3)  # 100% Complete
    # Upload the measured values rather than fixed sample figures. They are
    # sent as integer strings, the same type the payload used before.
    # NOTE(review): the REST key is hard-coded in source; consider loading
    # it from configuration instead.
    data = {"key": "Jasdkjfhsda349*lio34sdfFdslaPisdf",
            "download": str(int(infos['dlspeed'])),
            "upload": str(int(infos['upspeed'])),
            "latency": str(int(infos['latency']))}
    req = urllib.request.Request(RESULT_UPLOAD_URL, json.dumps(data).encode(
        'ascii'))
    req.add_header('Content-Type', 'application/json')
    resp = urllib.request.urlopen(req).read().decode('ascii')
    # The reply uses single quotes; swap them so it parses as JSON.
    resp = resp.replace('\'', '"')
    webbrowser.open(RESULT_VIEW_URL.format(json.loads(resp)['test_id']))
    input("Press Enter to Continue...")
My latency, upspeed and dlspeed variables are stored as infos, and later sent over to the DB for recording via sendtoDB(infos).
The next part is to also pass this set of variables to another website through a RESTful call. In the data, the first attribute "key" is the REST key for authentication, followed by the remaining values: latency, download speed and upload speed. However, as you can see, all three values in the data are hard-coded instead of being the values derived from the test, which are latency, upspeed and dlspeed.
How can I modify the code to get these attributes instead of the hardcoded ones?
You have a method that returns this dictionary...
# Result dictionary assembled in get_userinfos() from the ping and iperf3
# measurements.
res = {
'ip': ip,
'latency': avg_latency,
'upspeed': u_bits_per_second,
'dlspeed': d_bits_per_second,
'upspeedtime': u_testtime,
'dlspeedtime': d_testtime,
'upspeeddate': u_testdate,
'dlspeeddate': d_testdate
}
And it is called infos, so use it
# Build the upload payload from the measured values held in infos.
data = {"key": "xxxxxxxx",
        "download": infos['dlspeed'],
        "upload": infos['upspeed'],
        "latency": infos['latency']}

How to write JSON from response to file? (PYTHON)

so im trying to write this JSON from the Kik smiley site, and im trying to do this so I wont have to write it manually, anyways I need to parse the JSON so only some of the existing JSON shows up in the file (basically cleaning it) what I need from the site is... (name, id, type) how would I do this?
I have written this in python but it seems to fail, and im not 100% sure as to why. I am new to Python, so sorry if this is an obvious question! I did find something earlier but it just confused me even more :) Thank you!
import requests, json, sys
from colorama import init
from termcolor import colored
#colorama
init()
class SmileyGrabber():
    """Download the Kik smiley collection and record every smiley found."""

    def __init__(self):
        # requests vars
        self.smileysFound = 0
        self.smileysLost = 0
        self.url = "https://sticker-service.appspot.com/v2/collection/smiley"
        self.session = requests.Session()
        self.grabSmiley()

    def grabSmiley(self):
        """Fetch the collection once and hand each smiley to FormatSmileyData.

        Exits the program when the collection cannot be fetched or decoded.
        """
        try:
            r = self.session.get(self.url)
            collections = r.json()
        except KeyboardInterrupt:
            sys.exit()
        except Exception as exc:
            # Say why we are giving up instead of exiting silently.
            print("Could not fetch the smiley collection: {0}".format(exc))
            sys.exit(1)

        # NOTE(review): assumes the response is a list of collections, each
        # holding a "smileys" list of dicts; confirm against the live API.
        for collection in collections:
            for meta in collection.get("smileys", []):
                try:
                    sID = meta["id"]
                    sType = meta["type"]
                    sName = meta["name"]
                except KeyError:
                    print(colored("Could not grab smiley", "red"))
                    self.smileysLost += 1
                    continue
                FormatSmileyData(sID, sType, sName)
                print("Smiley Found: " + colored("({0})".format(sName), "cyan"))
                self.smileysFound += 1
class FormatSmileyData(object):
    """Append one smiley record to ``smileys.json`` when instantiated."""

    def __init__(self, sID, sType, sName):
        """Write the id, type and name as one JSON object on its own line."""
        record = {"SMILEY_ID": sID, "SMILEY_TYPE": sType, "SMILEY_NAME": sName}
        # Write the record that was just built, one JSON object per line.
        with open("smileys.json", "a+") as dataFile:
            dataFile.write(json.dumps(record) + "\n")


if __name__ == "__main__":
    SmileyGrabber()
There are a number of problems with your code.
It will be more efficient to read from the network all at once
rather than making a call to session.get for each smiley.
j does not have an "IsSuccess" element, so that will never be true
j["smileys"] is a list, so to get the dictionaries (which represent each smiley) you will need to iterate through that list.
You are appending data into data but you are writing from
smileyData, which never has any data entered into it.
Each time you call the FormatSmileyData constructor, you are
resetting the data.
Take a look at a tool like Postman to prettify the JSON so you can see the structure. This can help figure out how to parse it.
Here's an updated version of your script that appears to work:
I removed the colorization and made it work with Python 3.
import requests, json, sys
class SmileyGrabber():
    """Download the smiley collection once and save id, type and name of each."""

    def __init__(self):
        # requests vars
        self.smileysFound = 0
        self.smileysLost = 0
        self.url = "https://sticker-service.appspot.com/v2/collection/smiley"
        self.session = requests.Session()
        self.data = []
        self.grabSmiley()
        self.writeSmileyData()

    def grabSmiley(self):
        """Fetch the collection and record every smiley it contains."""
        r = self.session.get(self.url)
        j = r.json()
        print("got json")
        print(str(len(j)))
        for element in j:
            smileys = element["smileys"]
            if not smileys:
                # Only an element without smileys counts as lost. A for/else
                # here would run after every loop, since nothing breaks.
                print("Could not grab smiley")
                self.smileysLost += 1
                continue
            for meta in smileys:
                print("---------------")
                print(str(meta))
                sID = meta["id"]
                sType = meta["type"]
                sName = meta["name"]
                self.addSmileyData(sID, sType, sName)
                print("Smiley Found:" + "({0})".format(sName))
                self.smileysFound += 1
                print("found " + str(self.smileysFound))

    def addSmileyData(self, sID, sType, sName):
        """Queue one smiley record for writing."""
        self.data.append({"SMILEY_ID": sID, "SMILEY_TYPE": sType, "SMILEY_NAME": sName})

    def writeSmileyData(self):
        """Append all queued records to smileys.json as a single JSON array line."""
        with open("smileys.json", "a+") as dataFile:
            dataFile.write(json.dumps(self.data) + "\n")


if __name__ == "__main__":
    SmileyGrabber()

Parse JSON output in Python using Requests and its sessions

Here I have a rate stream that outputs the following and i'm looking to only print the "bid" price. Could someone help explain how I can parse the output correctly? It's driving me crazy!
example = 1.05653
I need the output without quotes or any other markup as well..
JSON
{
"tick": {
"instrument": "EUR_USD",
"time": "2015-04-13T14:28:26.123314Z",
"bid": 1.05653,
"ask": 1.05669
}
}
My code:
import requests
import json
from optparse import OptionParser
def connect_to_stream():
    """Open the streaming price connection and return the response.

    Environment           <Domain>
    fxTrade               stream-fxtrade.oanda.com
    fxTrade Practice      stream-fxpractice.oanda.com
    sandbox               stream-sandbox.oanda.com

    Returns:
        The streaming response object, or None when the connection attempt
        raised an exception (the error is printed).
    """
    # Replace the following variables with your personal ones
    domain = 'stream-fxpractice.oanda.com'
    access_token = 'xxxxx'
    account_id = 'xxxxxxxxx'
    instruments = "EUR_USD"
    # Create the session outside the try so the handler can always close it.
    s = requests.Session()
    try:
        url = "https://" + domain + "/v1/prices"
        headers = {'Authorization': 'Bearer ' + access_token,
                   # 'X-Accept-Datetime-Format' : 'unix'
                   }
        params = {'instruments': instruments, 'accountId': account_id}
        req = requests.Request('GET', url, headers=headers, params=params)
        pre = req.prepare()
        # NOTE(review): verify=False disables TLS certificate checking while
        # a bearer token is being sent; confirm this is intended.
        return s.send(pre, stream=True, verify=False)
    except Exception as e:
        s.close()
        print("Caught exception when connecting to stream\n" + str(e))
        return None
def demo(displayHeartbeat):
    """Print the raw messages received from the price stream.

    Args:
        displayHeartbeat: When true every message is printed; otherwise only
            messages carrying an "instrument" or "tick" key.
    """
    response = connect_to_stream()
    if response is None:
        # The connection attempt failed and has already been reported.
        return
    if response.status_code != 200:
        print(response.text)
        return
    for line in response.iter_lines(1):
        if not line:
            continue
        try:
            msg = json.loads(line)
        except Exception as e:
            print("Caught exception when converting message into json\n" + str(e))
            return
        # Print each message at most once.
        if displayHeartbeat or "instrument" in msg or "tick" in msg:
            print(line)
def main():
    """Parse the command line and start the stream demo.

    The -b / --displayHeartBeat flag turns on printing of heartbeat messages.
    """
    parser = OptionParser("usage: %prog [options]")
    parser.add_option("-b", "--displayHeartBeat", dest="verbose", action="store_true",
                      help="Display HeartBeat in streaming data")
    options, args = parser.parse_args()
    if len(args) > 1:
        parser.error("incorrect number of arguments")
    # The flag is unset (None) unless -b was given.
    demo(bool(options.verbose))


if __name__ == "__main__":
    main()
Sorry if this is an extremely basic question but I'm not that familiar with python..
Thanks in advance!
You are iterating over the stream line by line attempting to parse each line as JSON. Each line alone is not proper JSON so that's one problem.
I would just run a regex over each line you bring in, looking for the text `"bid":` followed by a decimal number, and return that number as a float. For example:
import re

# Compiled once; \s* accepts any amount of whitespace (including none)
# between the colon and the number.
BID_PATTERN = re.compile(r'"bid":\s*(\d*\.\d*)')
for line in response.iter_lines(1):
    match = BID_PATTERN.search(line)
    if match:
        print(float(match.group(1)))
Try something along the lines of this:
def demo(displayHeartbeat):
    """Print the value of every stream line that starts with the "bid" key."""
    response = connect_to_stream()
    for line in response.iter_lines():
        if line.startswith(" \"bid\""):
            print("bid:" + line.split(":")[1])
This actually turned out to be pretty easy, I fixed it by replacing the "demo" function with this:
def demo(displayHeartbeat):
    """Read the price stream and print one value per tick.

    Args:
        displayHeartbeat: When true every raw message is printed; otherwise,
            for each message carrying a "tick", the ask price minus .001 is
            printed.
    """
    response = connect_to_stream()
    if response is None:
        # The connection attempt failed; there is nothing to read.
        return
    if response.status_code != 200:
        print(response.text)
        return
    for line in response.iter_lines(1):
        if not line:
            continue
        try:
            msg = json.loads(line)
        except Exception as e:
            print("Caught exception when converting message into json\n" + str(e))
            return
        if displayHeartbeat:
            print(line)
        elif "tick" in msg:
            # Only tick messages carry prices; the other fields are read the
            # same way, e.g. tick["bid"], tick["time"], tick["instrument"].
            tick = msg["tick"]
            print(tick["ask"] - .001)

Python: multiple files download by turn

In my script, a loop downloads and saves files (with curl). But the loop iterates too quickly, so the download and save operations do not have time to complete. As a result, the saved files come out broken.
def get_images_thread(table):
    """Define a thread class that downloads the image of each stone row.

    NOTE(review): in the code visible here the thread class is only
    defined, never instantiated or started; confirm against the rest of
    the script.
    """
    class LoopThread(threading.Thread):
        def run(self):
            global db
            c = db.cursor()
            c.execute(" SELECT * FROM js_stones ORDER BY stone_id LIMIT 1\n")
            ec = EasyCurl(table)
            while True:
                stone = c.fetchone()
                if stone is None:
                    break
                img_fname = stone[2]
                print(img_fname)
                url = "http://www.jstone.it/" + img_fname
                # The last path component becomes the local file name.
                fname = url.strip("/").split("/")[-1].strip()
                ec.perform(url, filename="D:\\Var\\Python\\Jstone\\downloadeble_pictures\\" + fname,
                           progress=ec.textprogress)
This is an excerpt from the examples for the PycURL library,
# Make a queue with (url, filename) tuples
queue = Queue.Queue()
for url in urls:
    url = url.strip()
    # Skip blank lines and lines commented out with "#".
    if not url or url[0] == "#":
        continue
    filename = "doc_%03d.dat" % (len(queue.queue) + 1)
    queue.put((url, filename))
# Check args (explicit checks, because assert is stripped under -O)
if not queue.queue:
    raise ValueError("no URLs given")
num_urls = len(queue.queue)
num_conn = min(num_conn, num_urls)
if not 1 <= num_conn <= 10000:
    raise ValueError("invalid number of concurrent connections")
print("PycURL %s (compiled against 0x%x)" % (pycurl.version, pycurl.COMPILE_LIBCURL_VERSION_NUM))
print("----- Getting %s URLs using %s connections -----" % (num_urls, num_conn))
class WorkerThread(threading.Thread):
    """Thread that downloads (url, filename) jobs from a queue until it is empty."""

    def __init__(self, queue):
        threading.Thread.__init__(self)
        self.queue = queue

    def run(self):
        """Download each queued URL into its file, printing a dot per job."""
        while True:
            try:
                url, filename = self.queue.get_nowait()
            except Queue.Empty:
                # No work left: end this thread.
                return
            # The with-block and finally make sure the file and the curl
            # handle are released even when the transfer fails.
            with open(filename, "wb") as fp:
                curl = pycurl.Curl()
                try:
                    curl.setopt(pycurl.URL, url)
                    curl.setopt(pycurl.FOLLOWLOCATION, 1)
                    curl.setopt(pycurl.MAXREDIRS, 5)
                    curl.setopt(pycurl.CONNECTTIMEOUT, 30)
                    curl.setopt(pycurl.TIMEOUT, 300)
                    curl.setopt(pycurl.NOSIGNAL, 1)
                    curl.setopt(pycurl.WRITEDATA, fp)
                    try:
                        curl.perform()
                    except Exception:
                        # Report the failed transfer and move on to the next job.
                        import traceback
                        traceback.print_exc(file=sys.stderr)
                        sys.stderr.flush()
                finally:
                    curl.close()
            sys.stdout.write(".")
            sys.stdout.flush()
# Start a bunch of threads
threads = []
for _ in range(num_conn):
    worker = WorkerThread(queue)
    worker.start()
    threads.append(worker)
# Wait for all threads to finish
for worker in threads:
    worker.join()
If you're asking what I think you're asking,
from time import sleep
sleep(1)
should "solve" your problem (it's hacky to the max!). Docs here. I would first check that this really is your problem, though: it seems very unlikely that pausing for a few seconds would stop the files from being downloaded in a broken state. Some more detail would be nice too.
os.waitpid()
might also help.

Categories