I used this script:
from twisted.internet import reactor, threads
from urlparse import urlparse
import httplib
import itertools

concurrent = 200
finished = itertools.count(1)
reactor.suggestThreadPoolSize(concurrent)

def getStatus(ourl):
    url = urlparse(ourl)
    conn = httplib.HTTPConnection(url.netloc)
    conn.request("HEAD", url.path)
    res = conn.getresponse()
    return res.status

def processResponse(response, url):
    print response, url
    processedOne()

def processError(error, url):
    print "error", url #, error
    processedOne()

def processedOne():
    if finished.next() == added:
        reactor.stop()

def addTask(url):
    req = threads.deferToThread(getStatus, url)
    req.addCallback(processResponse, url)
    req.addErrback(processError, url)

added = 0
for url in open('urllist.txt'):
    added += 1
    addTask(url.strip())

try:
    reactor.run()
except KeyboardInterrupt:
    reactor.stop()
When I try to run the script with $ python test.py, it just prints the URLs; it does not send an HTTP request (or run cURL) for each one.
How can I send an HTTP or cURL request for each URL?
Thanks
This should work if your URLs do not contain 'http://'. If they do contain 'http://', the commented line in the code below shows how to handle that.
import httplib

def requester(url):
    host = url.split('/')[0]
    #if urls do contain 'http://' --> host = url.split('/')[2].replace('http://','')
    req = url[url.find(host)+len(host):]
    conn = httplib.HTTPConnection(host)
    conn.request("HEAD", "/"+req)
    response = conn.getresponse()
    print response.status, response.reason
    #if you want data...
    #data = response.read()
    #print data

for url in open('urls.txt'):
    try:
        requester(url.strip())  # strip the trailing newline
    except Exception, e:
        print "error", e
Furthermore, I recommend checking out the httplib documentation.
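As a side note (my own sketch, not from the original answer): the standard library's urlparse module can split the host and path for you, which handles URLs both with and without 'http://' and avoids the manual string slicing:

import httplib
from urlparse import urlparse

def requester(url):
    # urlparse needs '//' before the host to recognize scheme-less URLs
    parsed = urlparse(url if '//' in url else '//' + url)
    conn = httplib.HTTPConnection(parsed.netloc)
    conn.request("HEAD", parsed.path or "/")
    response = conn.getresponse()
    print response.status, response.reason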
Tested code, using inlineCallbacks and deferToThread. Also using defer.gatherResults to know when all the deferreds have been processed (instead of the counter method in the OP):
from twisted.internet import reactor, defer, utils
from twisted.internet.threads import deferToThread
from urlparse import urlparse
import httplib

# decorator: binds the decorated function as deferToThread's first argument
threadDeferred = deferToThread.__get__

@threadDeferred
def get_url_head(url_arg):
    url = urlparse(url_arg)
    conn = httplib.HTTPConnection(url.netloc)
    conn.request("HEAD", url.path)
    res = conn.getresponse()
    conn.close()
    return res.status

@defer.inlineCallbacks
def check_url(sem, url_arg):
    yield sem.acquire()
    try:
        result = yield get_url_head(url_arg)
        defer.returnValue(result)
    finally:
        sem.release()

@defer.inlineCallbacks
def run(reactor, SEMAPHORE_SIZE=10):
    sem = defer.DeferredSemaphore(SEMAPHORE_SIZE)
    deferreds = []
    failed_urls = []
    responded_urls = []
    with open('urllist.txt', 'r') as f:
        for line in f:
            url_arg = line.strip()
            d = check_url(sem, url_arg)
            d.addCallback(processResult, url_arg, responded_urls)
            d.addErrback(processErr, url_arg, failed_urls)
            deferreds.append(d)
    res = yield defer.gatherResults(deferreds)
    # Do something else with failed_urls and responded_urls
    reactor.callLater(0, reactor.stop)

def main():
    from twisted.internet import reactor
    reactor.callWhenRunning(run, reactor)
    reactor.run()

def processResult(result, url_arg, responded_urls):
    print "Response %s from %s" % (result, url_arg)
    responded_urls.append((url_arg, result))

def processErr(err, url_arg, failed_urls):
    print "Error checking %s: %s" % (url_arg, repr(err.value))
    failed_urls.append((url_arg, err.value))

if __name__ == '__main__':
    main()
I want to know how I can add simple threading to my code. At the moment it checks the sites one by one, and if a site isn't reachable it waits for the timeout before continuing with the next one, which slows everything down.
import requests
import sys
import time
from requests.packages.urllib3.exceptions import InsecureRequestWarning

requests.packages.urllib3.disable_warnings(InsecureRequestWarning)

with open("websites.txt", 'r') as websites:
    websites = websites.read().splitlines()
with open("para1.txt", 'r') as para1:
    para1 = para1.read().splitlines()
with open("para2.txt", 'r') as para2:
    para2 = para2.read().splitlines()

def main():
    for i in para1:
        for j in para2:
            for m in websites:
                try:
                    res = requests.get(m + i + j, verify=False, timeout=10)
                    print(m + i + j)
                    if res.status_code == 200:
                        print('Yes')
                    else:
                        print('No')
                except Exception as e:
                    print(e)
                except KeyboardInterrupt:
                    sys.exit()
                finally:
                    res.close()
                    time.sleep(1)

if __name__ == '__main__':
    main()
You can use a ThreadPoolExecutor: move the part of the code that performs the request into a separate function and submit it to the executor:
import urllib3
import requests
from concurrent.futures import ThreadPoolExecutor, CancelledError, as_completed

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

def check_func(url):
    response = requests.get(url, verify=False, timeout=10)
    return response.status_code == 200

def main():
    # read the files up front so the nested loops can iterate them repeatedly
    with open("websites.txt") as website_f, open("para1.txt") as para1_f, \
            open("para2.txt", 'r') as para2_f:
        websites = website_f.read().splitlines()
        paras1 = para1_f.read().splitlines()
        paras2 = para2_f.read().splitlines()
    with ThreadPoolExecutor(max_workers=4) as executor:
        tasks = {}
        for website in websites:
            for para1 in paras1:
                for para2 in paras2:
                    url = website.rstrip() + para1.rstrip() + para2.rstrip()
                    tasks[executor.submit(check_func, url)] = url
        for task in as_completed(tasks):
            url = tasks[task]
            try:
                result = task.result()
            except KeyboardInterrupt:  # handling Ctrl + C
                for t in tasks:
                    t.cancel()  # won't cancel already running or finished futures
            except CancelledError:  # will never happen (normally)
                pass
            except Exception as e:
                print(url, "-", "ERROR", e)
            else:
                print(url, "-", "GOOD" if result else "BAD")

if __name__ == "__main__":
    main()
P.S. I haven't tested the entire code, so if there are any problems with it, write in the comments.
I've been using a function that I took from the book Web Scraping with Python from O'Reilly by Ryan Mitchell:
import sys
import os.path
import socket
import random
import urllib2
import contextlib
import diskCache
import logging as logger
from bs4 import BeautifulSoup

DEFAULT_AGENT = 'Mozilla/5.0 Firefox/56.0'
DEFAULT_DELAY = 3
DEFAULT_RETRIES = 10
DEFAULT_TIMEOUT = 60

socket.setdefaulttimeout(DEFAULT_TIMEOUT)

def download(url, delay=DEFAULT_DELAY, user_agent=DEFAULT_AGENT, proxies=None,
             cache=None, num_retries=DEFAULT_RETRIES, timeout=DEFAULT_TIMEOUT, data=None):
    result = None
    if cache:
        try:
            result = cache[url]
        except KeyError:
            # url is not available in cache
            pass
    if result is not None and result['code'] is not None \
            and num_retries > 0 and 500 <= result['code'] < 600:
        # server error so ignore result from cache and re-download
        result = None
    if result is None:
        proxy = random.choice(proxies) if proxies else None
        headers = {'User-agent': user_agent}
        result = call(url, headers, proxy=proxy, num_retries=num_retries, cache=cache)
        if cache:
            # save result to cache
            cache[url] = result
    return result['html']

def call(url, headers, proxy, num_retries, cache=None, data=None):
    request = urllib2.Request(url, data, headers or {})
    with contextlib.closing(urllib2.urlopen(request)) as connection:
        try:
            logger.info('Downloading: %s', url)
            html = connection.read()
            code = connection.getcode()
        except Exception as e:
            logger.exception('Download error: %s', str(e))
            if cache:
                del cache[url]
            html = None
            if hasattr(e, 'code'):
                code = e.code
                if num_retries > 0 and 500 <= code < 600:
                    return download(url, headers, num_retries-1, data)  # retry server errors
            else:
                code = None
    return {'html': html, 'code': code}
I wanted to know if there is a simpler way of handling errors when downloading URLs. I've seen that the requests library is a higher-level, easier library, and maybe it could simplify this. At the very least, how would this code look in Python 3?
It would be something like:
"""Functions used by the fetch module"""
# Standard library imports
import time
import socket
import logging as logger
from typing import Dict, Optional
# Third party imports
import requests
from requests.exceptions import HTTPError, Timeout
from bs4 import BeautifulSoup
# Constants
DEFAULT_AGENT = 'Mozilla/5.0 Firefox/56.0'
DEFAULT_DELAY = 3
DEFAULT_RETRIES = 10
DEFAULT_TIMEOUT = 60
socket.setdefaulttimeout(DEFAULT_TIMEOUT)
def fetch(url: str, retries: Optional[int] = DEFAULT_RETRIES) -> Dict:
"""Download an url"""
code = None
try:
logger.info('Downloading: %s', url)
resp = requests.get(url)
resp.raise_for_status()
code = resp.status_code
except (HTTPError, Timeout) as ex:
logger.exception("Couldn't download %s", ex)
return None
if code is not None and retries > 0 and \
500 <= code < 600: # Server error
logger.info('Retrying download')
time.sleep(DEFAULT_DELAY)
return fetch(url, retries-1)
return {'html': resp, 'code': code}
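Usage would then be something like this (hypothetical URL):

result = fetch('http://www.example.com')
if result:
    print(result['code'])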
As you said, this is a lot easier with requests:
resp = requests.get(url, headers=headers, timeout=timeout)
print(resp.status_code)
print(resp.text)
# for an API use resp.json()
There is no exception raised by default. You can call resp.raise_for_status() if you do want to raise an exception.
See http://docs.python-requests.org/en/master/user/quickstart/ for details
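If you also want retries without writing the loop yourself, requests can retry automatically through urllib3's Retry class mounted on a Session. A minimal sketch (my addition, not from the original answer; the numbers are arbitrary):

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# retry up to 5 times on typical server errors, with exponential backoff
retry = Retry(total=5, backoff_factor=0.5, status_forcelist=[500, 502, 503, 504])
session = requests.Session()
session.mount('http://', HTTPAdapter(max_retries=retry))
session.mount('https://', HTTPAdapter(max_retries=retry))
resp = session.get('http://www.example.com', timeout=60)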
error: (3, 'Illegal characters found in URL')
My URL contains special characters like [AVC_(1)_(P1)_0].
I can't get this to work. I tried encoding, but that gives me "Could not resolve host: https%3A".
Please advise.
import sys
import Queue
import threading
import pycurl
import os
import urllib
from StringIO import StringIO

num_conn = 1

# Make a queue with (url, filename) tuples
queue = Queue.Queue()
with open('list.txt') as f:
    for line in f:
        print line
        queue.put((line, 'test.mp4'))
        if 'str' in line:
            break

# Check args
assert queue.queue, "no URLs given"
num_urls = len(queue.queue)
num_conn = min(num_conn, num_urls)
assert 1 <= num_conn <= 10000, "invalid number of concurrent connections"
print "PycURL %s (compiled against 0x%x)" % (pycurl.version, pycurl.COMPILE_LIBCURL_VERSION_NUM)
print "----- Getting", num_urls, "URLs using", num_conn, "connections -----"

class WorkerThread(threading.Thread):
    def __init__(self, queue):
        threading.Thread.__init__(self)
        self.queue = queue

    def run(self):
        while 1:
            try:
                url, filename = self.queue.get_nowait()
            except Queue.Empty:
                raise SystemExit
            #dirname = os.path.dirname(filename)
            #fp = open(dirname, "wb")
            #url = urllib.quote(url.encode('utf-8'))
            fp = open(os.getcwd()+'/'+filename, "wb")
            curl = pycurl.Curl()
            curl.setopt(pycurl.URL, url)
            curl.setopt(pycurl.FOLLOWLOCATION, 1)
            curl.setopt(pycurl.MAXREDIRS, 5)
            curl.setopt(pycurl.CONNECTTIMEOUT, 30)
            curl.setopt(pycurl.TIMEOUT, 300)
            curl.setopt(pycurl.NOSIGNAL, 1)
            curl.setopt(pycurl.WRITEDATA, fp)
            try:
                curl.perform()
            except:
                import traceback
                traceback.print_exc(file=sys.stderr)
                sys.stderr.flush()
            curl.close()
            fp.close()
            sys.stdout.write(".")
            sys.stdout.flush()

# Start a bunch of threads
threads = []
for dummy in range(num_conn):
    t = WorkerThread(queue)
    t.start()
    threads.append(t)

# Wait for all threads to finish
for thread in threads:
    thread.join()
Why not use requests in lieu of pycurl? That would make your run method:
def run(self):
    while True:
        try:
            url, filename = self.queue.get_nowait()
        except Queue.Empty:
            raise SystemExit
        with open(os.getcwd()+'/'+filename, "wb") as fp:
            #fp.write(requests.get(url).content)
            fp.write(requests.get(url, headers={'user-agent': 'CodeGuru'}).content)
I made a few other, stylistic changes.
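As for the original error itself: characters like '[' and ']' are not legal in a raw URL, and quoting the whole string also encodes the scheme, which is why you saw "Could not resolve host: https%3A". A sketch of one fix (my addition, assuming Python 2 like the rest of this thread) is to quote only the path component, and to strip the newline that each line read from list.txt carries:

import urllib
from urlparse import urlparse, urlunparse

def sanitize_url(url):
    # percent-encode only the path, leaving scheme and host untouched
    parts = urlparse(url.strip())
    return urlunparse((parts.scheme, parts.netloc, urllib.quote(parts.path),
                       parts.params, parts.query, parts.fragment))

# then in run(): curl.setopt(pycurl.URL, sanitize_url(url))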
Just a little question: is it possible to force a build in Buildbot via a Python script or the command line (and not via the web interface)?
Thank you!
If you have a PBChangeSource configured in your master.cfg, you can send a change from the command line:
buildbot sendchange --master {MASTERHOST}:{PORT} --auth {USER}:{PASS}
--who {USER} {FILENAMES..}
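For reference, a minimal sketch of the matching change source in master.cfg (my addition; the port and credentials are placeholders, assuming a 0.8.x-era Buildbot like the rest of this thread):

# master.cfg fragment
from buildbot.changes.pb import PBChangeSource
c['change_source'] = PBChangeSource(port=9989, user='change', passwd='changepw')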
You can make a Python script using the urllib2 or requests library to simulate a POST to the web UI:
import urllib2
import urllib
import cookielib
import uuid
import unittest
import sys
from StringIO import StringIO

class ForceBuildApi():
    MAX_RETRY = 3

    def __init__(self, server):
        self.server = server
        cookiejar = cookielib.CookieJar()
        self.urlOpener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookiejar))

    def login(self, user, passwd):
        data = urllib.urlencode(dict(username=user,
                                     passwd=passwd))
        url = self.server + "login"
        request = urllib2.Request(url, data)
        res = self.urlOpener.open(request).read()
        if res.find("The username or password you entered were not correct") > 0:
            raise Exception("invalid password")

    def force_build(self, builder, reason, **kw):
        """Create a buildbot build request

        several attempts are created in case of errors
        """
        reason = reason + " ID=" + str(uuid.uuid1())
        kw['reason'] = reason
        data_str = urllib.urlencode(kw)
        url = "%s/builders/%s/force" % (self.server, builder)
        print url
        request = urllib2.Request(url, data_str)
        file_desc = None
        for i in xrange(self.MAX_RETRY):
            try:
                file_desc = self.urlOpener.open(request)
                break
            except Exception as e:
                print >>sys.stderr, "error when doing force build", e
        if file_desc is None:
            print >>sys.stderr, "too many errors, giving up"
            return None
        for line in file_desc:
            if 'alert' in line:
                print >>sys.stderr, "invalid arguments", url, data_str
                return None
            if 'Authorization Failed' in line:
                print >>sys.stderr, "Authorization Failed"
                return
        return reason

class ForceBuildApiTest(unittest.TestCase):
    def setUp(self):
        from mock import Mock  # pip install mock for test
        self.api = ForceBuildApi("server/")
        self.api.urlOpener = Mock()
        urllib2.Request = Mock()
        uuid.uuid1 = Mock()
        uuid.uuid1.return_value = "myuuid"
        sys.stderr = StringIO()

    def test_login(self):
        from mock import call
        self.api.login("log", "pass")
        self.assertEquals(len(self.api.urlOpener.open.call_args_list), 1)
        req = urllib2.Request.call_args_list
        self.assertEquals([call('server/login', 'passwd=pass&username=log')], req)

    def test_force(self):
        from mock import call
        self.api.urlOpener.open.return_value = ["blabla"]
        r = self.api.force_build("builder1", reason="reason", param1="foo", param2="bar")
        self.assertEquals(len(self.api.urlOpener.open.call_args_list), 1)
        req = urllib2.Request.call_args_list
        self.assertEquals([call('server//builders/builder1/force', 'reason=reason+ID%3Dmyuuid&param2=bar&param1=foo')], req)
        self.assertEquals(r, "reason ID=myuuid")

    def test_force_fail1(self):
        from mock import call
        self.api.urlOpener.open.return_value = ["alert bla"]
        r = self.api.force_build("builder1", reason="reason", param1="foo", param2="bar")
        self.assertEquals(len(self.api.urlOpener.open.call_args_list), 1)
        req = urllib2.Request.call_args_list
        self.assertEquals([call('server//builders/builder1/force', 'reason=reason+ID%3Dmyuuid&param2=bar&param1=foo')], req)
        self.assertEquals(sys.stderr.getvalue(), "invalid arguments server//builders/builder1/force reason=reason+ID%3Dmyuuid&param2=bar&param1=foo\n")
        self.assertEquals(r, None)

    def test_force_fail2(self):
        from mock import call
        def raise_exception(*a, **kw):
            raise Exception("oups")
        self.api.urlOpener.open = raise_exception
        r = self.api.force_build("builder1", reason="reason", param1="foo", param2="bar")
        req = urllib2.Request.call_args_list
        self.assertEquals([call('server//builders/builder1/force', 'reason=reason+ID%3Dmyuuid&param2=bar&param1=foo')], req)
        self.assertEquals(sys.stderr.getvalue(), "error when doing force build oups\n"*3 + "too many errors, giving up\n")
        self.assertEquals(r, None)

    def test_force_fail3(self):
        from mock import call
        self.api.urlOpener.open.return_value = ["bla", "blu", "Authorization Failed"]
        r = self.api.force_build("builder1", reason="reason", param1="foo", param2="bar")
        req = urllib2.Request.call_args_list
        self.assertEquals([call('server//builders/builder1/force', 'reason=reason+ID%3Dmyuuid&param2=bar&param1=foo')], req)
        self.assertEquals(sys.stderr.getvalue(), "Authorization Failed\n")
        self.assertEquals(r, None)

if __name__ == '__main__':
    unittest.main()
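Usage would then look something like this (the server URL and credentials are placeholders):

api = ForceBuildApi("http://buildbot.example.com:8010/")
api.login("myuser", "mypass")
api.force_build("builder1", reason="triggered from script", branch="master")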
OK, this should be simple, since people do it all the time. I want to get the body of a POST request sent by a Twisted Agent. The body is created with a Twisted FileBodyProducer. On the server side, I get a request object in my render_POST method.
How do I retrieve the body?
server:
from twisted.web import server, resource
from twisted.internet import reactor

class Simple(resource.Resource):
    isLeaf = True

    def render_GET(self, request):
        return "{0}".format(request.args.keys())

    def render_POST(self, request):
        return "{0}".format(request.data)
        # never reached -- my attempt at writing the body to a file:
        with open(request.args['filename'][0], 'rb') as fd:
            fd.write(request.write())

site = server.Site(Simple())
reactor.listenTCP(8080, site)
reactor.run()
client:
from StringIO import StringIO

from twisted.internet import reactor
from twisted.web.client import Agent
from twisted.web.http_headers import Headers
from twisted.web.client import FileBodyProducer
from twisted.internet.defer import Deferred
from twisted.internet.protocol import Protocol
from pprint import pformat

class BeginningPrinter(Protocol):
    def __init__(self, finished):
        self.finished = finished
        self.remaining = 1024 * 10

    def dataReceived(self, bytes):
        if self.remaining:
            display = bytes[:self.remaining]
            print 'Some data received:'
            print display
            self.remaining -= len(display)

    def connectionLost(self, reason):
        print 'Finished receiving body:', reason.getErrorMessage()
        self.finished.callback(None)

agent = Agent(reactor)
body = FileBodyProducer(StringIO("hello, world"))
d = agent.request(
    'POST',
    'http://127.0.0.1:8080/',
    Headers({'User-Agent': ['Twisted Web Client Example'],
             'Content-Type': ['text/x-greeting']}),
    body)

def cbRequest(response):
    print 'Response version:', response.version
    print 'Response code:', response.code
    print 'Response phrase:', response.phrase
    print 'Response headers:'
    print pformat(list(response.headers.getAllRawHeaders()))
    finished = Deferred()
    response.deliverBody(BeginningPrinter(finished))
    return finished
d.addCallback(cbRequest)

def cbShutdown(ignored):
    reactor.stop()
d.addBoth(cbShutdown)

reactor.run()
The only docs I can find for setting up the consumer side leave something to be desired. Primarily, how can a consumer use the write(data) method to receive results?
Which bit am I missing to plug these two components together?
All right, so it's as simple as calling request.content.read(). This, as far as I can tell, is undocumented in the API.
Here's the updated code for the client:
from twisted.internet import reactor
from twisted.web.client import Agent
from twisted.web.http_headers import Headers
from twisted.web.client import FileBodyProducer
from twisted.internet.defer import Deferred
from twisted.internet.protocol import Protocol
from pprint import pformat

class BeginningPrinter(Protocol):
    def __init__(self, finished):
        self.finished = finished
        self.remaining = 1024 * 10

    def dataReceived(self, bytes):
        if self.remaining:
            display = bytes[:self.remaining]
            print 'Some data received:'
            print display
            self.remaining -= len(display)

    def connectionLost(self, reason):
        print 'Finished receiving body:', reason.getErrorMessage()
        self.finished.callback(None)

class SaveContents(Protocol):
    def __init__(self, finished, filesize, filename):
        self.finished = finished
        self.remaining = filesize
        self.outfile = open(filename, 'wb')

    def dataReceived(self, bytes):
        if self.remaining:
            display = bytes[:self.remaining]
            self.outfile.write(display)
            self.remaining -= len(display)
        else:
            self.outfile.close()

    def connectionLost(self, reason):
        print 'Finished receiving body:', reason.getErrorMessage()
        self.outfile.close()
        self.finished.callback(None)

agent = Agent(reactor)
f = open('70935-new_barcode.pdf', 'rb')
body = FileBodyProducer(f)
d = agent.request(
    'POST',
    'http://127.0.0.1:8080?filename=test.pdf',
    Headers({'User-Agent': ['Twisted Web Client Example'],
             'Content-Type': ['multipart/form-data; boundary=1024']}),
    body)

def cbRequest(response):
    print 'Response version:', response.version
    print 'Response code:', response.code
    print 'Response phrase:', response.phrase
    print 'Response headers:'
    print 'Response length:', response.length
    print pformat(list(response.headers.getAllRawHeaders()))
    finished = Deferred()
    response.deliverBody(SaveContents(finished, response.length, 'test2.pdf'))
    return finished
d.addCallback(cbRequest)

def cbShutdown(ignored):
    reactor.stop()
d.addBoth(cbShutdown)

reactor.run()
And here's the server:
from twisted.web import server, resource
from twisted.internet import reactor
import os

# multi part encoding example: http://marianoiglesias.com.ar/python/file-uploading-with-multi-part-encoding-using-twisted/
class Simple(resource.Resource):
    isLeaf = True

    def render_GET(self, request):
        return "{0}".format(request.args.keys())

    def render_POST(self, request):
        with open(request.args['filename'][0], 'wb') as fd:
            fd.write(request.content.read())
        request.setHeader('Content-Length', os.stat(request.args['filename'][0]).st_size)
        with open(request.args['filename'][0], 'rb') as fd:
            request.write(fd.read())
        request.finish()
        return server.NOT_DONE_YET

site = server.Site(Simple())
reactor.listenTCP(8080, site)
reactor.run()
I can now write the file contents I receive, and read back the results.
If the content type is application/x-www-form-urlencoded or multipart/form-data,
the body will be parsed and put into the request.args dict.
If the body is too big, it is written to a temporary file; otherwise it is kept in a StringIO.
After the body is read, the method finish() is called. You can subclass Request and
parse the body in this method, or do something else.
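To illustrate the distinction (a sketch of my own, not from the original answer): for a form-encoded POST the parsed fields show up in request.args, while for any other content type you read the raw body from request.content:

from twisted.web import server, resource

class EchoBody(resource.Resource):
    isLeaf = True

    def render_POST(self, request):
        content_type = request.getHeader('content-type') or ''
        if 'x-www-form-urlencoded' in content_type or 'form-data' in content_type:
            # Twisted has already parsed the body into request.args
            return "form fields: {0}".format(request.args)
        # otherwise read the raw body (a temp file or StringIO underneath)
        return "raw body: {0}".format(request.content.read())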
If you want to make a simple POST with a body (not a file), you can do it as follows:
import urllib

from twisted.internet import protocol
from twisted.internet import defer
from twisted.web.http_headers import Headers
from twisted.internet import reactor
from twisted.web.client import Agent
from twisted.web.iweb import IBodyProducer

from zope.interface import implements
from twisted.internet.defer import succeed

class StringProducer(object):
    implements(IBodyProducer)

    def __init__(self, body):
        self.body = body
        self.length = len(body)

    def startProducing(self, consumer):
        consumer.write(self.body)
        return succeed(None)

    def pauseProducing(self):
        pass

    def stopProducing(self):
        pass

class SimpleReceiver(protocol.Protocol):
    def __init__(self, d):
        self.buf = ''
        self.d = d

    def dataReceived(self, data):
        self.buf += data

    def connectionLost(self, reason):
        self.d.callback(self.buf)

def httpRequest(url, values=None, headers=None, method='POST'):
    agent = Agent(reactor)
    data = urllib.urlencode(values) if values else None
    d = agent.request(method, url,
                      Headers(headers) if headers else None,  # Agent expects Headers or None
                      StringProducer(data) if data else None)

    def handle_response(response):
        if response.code == 204:
            d = defer.succeed('')
        else:
            d = defer.Deferred()
            response.deliverBody(SimpleReceiver(d))
        return d

    d.addCallback(handle_response)
    return d
Now, to use the above in real code, you can do e.g.:

d = httpRequest('http://...', post_data_as_dictionary, some_headers, 'POST')
d.addCallback(your_ok_callback_function)
d.addErrback(your_errorback_function)
Example headers would look like this:

headers = {'Accept': ['application/json'],
           'Content-Type': ['application/x-www-form-urlencoded']}
I hope that helps