The code reads urls from file and push it to queue assigned to thread and do third party web api call in order to get result that goes to the global list.
When I execute this program sometime it will go to the end and finishes process(printing done) sometime it is stuck and hold the process never finishes.
It seems like if there is an exception("We failed to reach a server") it holds the process and never finishes. I believe that it is thread problem.
Any body can figure it out what is the issue please. Thank you in advance
Here is the code
import threading
import Queue
import hmac
import hashlib
import base64
import urllib2
from urllib2 import Request, urlopen, URLError, HTTPError
import sys
import httplib, urllib, time, random, os
import json
from urlparse import urlparse
import time
#Number of threads
n_thread = 50
#Create queue
queue = Queue.Queue()
domainBlacklistDomain=[]
urlList=[]
def checkBlackList(domain,line):
testUrl = 'https://test.net'
apiToken = 'aaaaa'
secretKey = 'bbbb'
signature_data = 'GET\n/v1/blacklist/lookup\nurl='+domain+'\n\n\n'
digest = hmac.new(secretKey, signature_data, hashlib.sha1).digest()
digest_base64 = base64.encodestring(digest)
req = urllib2.Request('https://test.net/v1/blacklist/lookup?url='+domain)
req.add_header('Authorization', 'Test' + apiToken + ':' + digest_base64)
req.add_header('Connection', 'Keep-Alive')
try:
page = urlopen(req)
length = str(page.info())
if length.find("Content-Length: 0") != -1:
url=str(line.strip())
urlList.append(url)
else:
json_data=json.load(page)
domainBlacklistDomain.append(json_data['url'])
if int(json_data['score']) >10:
print json_data['url']
except HTTPError, e:
print 'The server couldn\'t fulfill the request.'
except URLError, e:
print 'We failed to reach a server.'
class ThreadClass(threading.Thread):
def __init__(self, queue):
threading.Thread.__init__(self)
#Assign thread working with queue
self.queue = queue
def run(self):
while True:
#Get from queue job
host = self.queue.get()
parsed_uri = urlparse(host)
domain = '{uri.scheme}://{uri.netloc}/'.format(uri=parsed_uri)
if "\n" in domain:
domain=domain.replace('\n', '').replace('\r', '')
if domain not in domainBlacklistDomain:
checkBlackList(domain,host):
else:
if domain not in domainBlacklistDomain:
checkBlackList(domain,host):
#signals to queue job is done
self.queue.task_done()
#Create number process
for i in range(n_thread):
t = ThreadClass(queue)
t.setDaemon(True)
#Start thread
t.start()
#Read file line by line
hostfile = open("result_url.txt","r")
for line in hostfile:
#Put line to queue
queue.put(line)
#wait on the queue until everything has been processed
queue.join()
fo=open("final_result.txt","w+b")
for item in urlList:
fo.write("%s\n" %item)
print "done??"
Without reading your code in detail, the issue is almost certainly to do with trying to establish a connection to a non-responsive IP address. The timeouts on these connections can be lengthy.
Try using the socket.setdefaulttimeout() function to establish a global socket timeout.
Related
The code below is a simplified version of a Tornado based TCP server that is currently used to host a Videotex system. This code was derived from the Tornado documentation and the server has been running in a live environment for some time without issue, however, there is a feature I need to add.
The system currently blocks until a character is received from the client before returning the data via the stream.write. As the system typically runs at 1200 baud at the client end (via a telnet modem), this means that the user has to wait until all stream writes have completed before the next 'user entered' character is processed.
What I would like to do is find a way that would allow me to abandon writing data to stream.write if another character is received form the client.
I am new to Tornado and fairly new to Python, however, I have coded asynchronous functions and threaded solutions in the past using C#.
From the documentation the stream.write operation is asynchronous, I am assuming therefore that the call may return before the data is completely written, I am left thinking that I need a method to abandon/empty/advance the write buffer to stop the write operation if a new char is detected on the stream.read.
One option that would seem to give me what I need is to somehow perform the stream.writes on another thread , however, this approach seems inappropriate when using Tornado's IOLoop etc.
Is there a way to give me the facility I am after? I have full control of the code and am happy to restructure the app if needed.
import logging
import struct
import os
import traceback
from tornado import gen
from tornado.ioloop import IOLoop
from tornado.iostream import StreamClosedError
from tornado.tcpserver import TCPServer
# Configure logging.
logger = logging.getLogger(os.path.basename(__file__))
logger.setLevel(logging.INFO)
# Cache this struct definition; important optimization.
int_struct = struct.Struct("<i")
_UNPACK_INT = int_struct.unpack
_PACK_INT = int_struct.pack
class TornadoServer(TCPServer):
def start(self, port):
self.port = port
server.listen(port)
#gen.coroutine
def handle_stream(self, stream, address):
logging.info("[viewdata] Connection from client address {0}.".format(address))
try:
while True:
char = yield stream.read_bytes(1) # this call blocks
asc = ord(char)
logger.info('[viewdata] Byte Received {0} ({1})'.format(hex(asc), asc))
# Do some processing using the received char and return the appropriate page of data
stream.write('This is the data you asked for...'.encode())
except StreamClosedError as ex:
logger.info("[viewdata] {0} Disconnected: {1} Message: {2}".format(address, type(ex), str(ex)))
except Exception as ex:
logger.error("[viewdata] {0} Exception: {1} Message: {2}".format(address, type(ex), str(ex)))
logger.error(traceback.format_exc())
if __name__ == '__main__':
server = TornadoServer()
server.start(25232)
loop = IOLoop.current()
loop.start()
The main idea is that you move long processing into separate task.
When you receive some new data, you choose what to do (in case below I cancel current operation)
import logging
import os
import traceback
import threading
from tornado import gen
from tornado.ioloop import IOLoop
from tornado.iostream import StreamClosedError
from tornado.tcpserver import TCPServer
# Configure logging.
logger = logging.getLogger(os.path.basename(__file__))
logger.setLevel(logging.INFO)
class TornadoServer(TCPServer):
def start(self, port):
self.port = port
server.listen(port)
async def process_stream(self, stream, char, cancel_event):
asc = ord(char)
logger.info('[viewdata] Byte Received {0} ({1})'.format(hex(asc), asc))
N = 5
for i in range(N):
if cancel_event.is_set():
logger.info('[viewdata] Abort streaming')
break
# Do some processing using the received char and return the appropriate page of data
msg = 'This is the {0} data you asked for...'.format(i)
logger.info(msg)
await stream.write('This is the part {0} of {1} you asked for...'.format(i, N).encode())
await gen.sleep(1.0) # make this processing longer..
async def handle_stream(self, stream, address):
process_stream_future = None
cancel_event = None
logging.info("[viewdata] Connection from client address {0}.".format(address))
while True:
try:
char = await stream.read_bytes(1) # this call blocks
# when received client input, cancel running job
if process_stream_future:
process_stream_future.cancel()
if cancel_event:
cancel_event.set()
cancel_event = threading.Event()
process_stream_future = gen.convert_yielded(
self.process_stream(stream, char, cancel_event))
self.io_loop.add_future(process_stream_future, lambda f: f.result())
except StreamClosedError as ex:
logger.info("[viewdata] {0} Disconnected: {1} Message: {2}".format(address, type(ex), str(ex)))
except Exception as ex:
logger.error("[viewdata] {0} Exception: {1} Message: {2}".format(address, type(ex), str(ex)))
logger.error(traceback.format_exc())
if __name__ == '__main__':
server = TornadoServer()
server.listen(25232)
loop = IOLoop.current()
loop.start()
After a long time writing this small script, it finally worked; or better to say, it's almost done. I am having just a small problem. I am not able to send COOKIES as add.headers in the urllib.request. What am I doing wrong? I need to send a given COOKIE or the website will not allow me to download the .pdf file, but I believe I'm doing this the wrong way.
Here is my code; please let me know what is wrong:
import os
import threading
import urllib.request
from queue import Queue
class Downloader(threading.Thread):
"""Threaded File Downloader"""
def __init__(self, queue):
"""Initialize the thread"""
threading.Thread.__init__(self)
self.queue = queue
def run(self):
"""Run the thread"""
while True:
# gets the url from the queue
url = self.queue.get()
# download the file
self.download_file(url)
# send a signal to the queue that the job is done
self.queue.task_done()
def download_file(self, url):
"""Download the file"""
handle = urllib.request.urlopen(url)
faturanum = 20184009433300
fatura = str(faturanum)
fname = fatura + ".pdf"
handle.addheaders = [('Cookie', 'ASP.NET_SessionId=zstuzktl0x1laoqhxgkm4ign')]
with open(fname, "wb") as f:
while True:
chunk = handle.read(1024)
if not chunk: break
f.write(chunk)
def main(urls):
"""
Run the program
"""
queue = Queue()
# create a thread pool and give them a queue
for i in range(5):
t = Downloader(queue)
t.setDaemon(True)
t.start()
# give the queue some data
for url in urls:
queue.put(url)
# wait for the queue to finish
queue.join()
if __name__ == "__main__":
urls = ["https://pagamentodigitaltsting.com/Fatura/Pdf?nrFatura=20193981821"]
main(urls)
What is wrong that I can not send cookies with the request? As you can see, the website is being served through https. Once the page loads, it renders a pdf file.
I haven't done twisted programming in a while so I'm trying to get back into it for a new project. I'm attempting to set up a twisted client that can take a list of servers as an argument, and for each server it sends an API GET call and writes the return message to a file. This API GET call should be repeated every 60 seconds.
I've done it successfully with a single server using Twisted's agent class:
from StringIO import StringIO
from twisted.internet import reactor
from twisted.internet.protocol import Protocol
from twisted.web.client import Agent
from twisted.web.http_headers import Headers
from twisted.internet.defer import Deferred
import datetime
from datetime import timedelta
import time
count = 1
filename = "test.csv"
class server_response(Protocol):
def __init__(self, finished):
print "init server response"
self.finished = finished
self.remaining = 1024 * 10
def dataReceived(self, bytes):
if self.remaining:
display = bytes[:self.remaining]
print 'Some data received:'
print display
with open(filename, "a") as myfile:
myfile.write(display)
self.remaining -= len(display)
def connectionLost(self, reason):
print 'Finished receiving body:', reason.getErrorMessage()
self.finished.callback(None)
def capture_response(response):
print "Capturing response"
finished = Deferred()
response.deliverBody(server_response(finished))
print "Done capturing:", finished
return finished
def responseFail(err):
print "error" + err
reactor.stop()
def cl(ignored):
print "sending req"
agent = Agent(reactor)
headers = {
'authorization': [<snipped>],
'cache-control': [<snipped>],
'postman-token': [<snipped>]
}
URL = <snipped>
print URL
a = agent.request(
'GET',
URL,
Headers(headers),
None)
a.addCallback(capture_response)
reactor.callLater(60, cl, None)
#a.addBoth(cbShutdown, count)
def cbShutdown(ignored, count):
print "reactor stop"
reactor.stop()
def parse_args():
usage = """usage: %prog [options] [hostname]:port ...
Run it like this:
python test.py hostname1:instanceName1 hostname2:instancename2 ...
"""
parser = optparse.OptionParser(usage)
_, addresses = parser.parse_args()
if not addresses:
print parser.format_help()
parser.exit()
def parse_address(addr):
if ':' not in addr:
hostName = '127.0.0.1'
instanceName = addr
else:
hostName, instanceName = addr.split(':', 1)
return hostName, instanceName
return map(parse_address, addresses)
if __name__ == '__main__':
d = Deferred()
d.addCallbacks(cl, responseFail)
reactor.callWhenRunning(d.callback, None)
reactor.run()
However I'm having a tough time figuring out how to have multiple agents sending calls. With this, I'm relying on the end of the write in cl() ---reactor.callLater(60, cl, None) to create the call loop. So how do I create multiple call agent protocols (server_response(Protocol)) and continue to loop through the GET for each of them once my reactor is started?
Look what the cat dragged in!
So how do I create multiple call agent
Use treq. You rarely want to get tangled up with the Agent class.
This API GET call should be repeated every 60 seconds
Use LoopingCalls instead of callLater, in this case it's easier and you'll run into less problems later.
import treq
from twisted.internet import task, reactor
filename = 'test.csv'
def writeToFile(content):
with open(filename, 'ab') as f:
f.write(content)
def everyMinute(*urls):
for url in urls:
d = treq.get(url)
d.addCallback(treq.content)
d.addCallback(writeToFile)
#----- Main -----#
sites = [
'https://www.google.com',
'https://www.amazon.com',
'https://www.facebook.com']
repeating = task.LoopingCall(everyMinute, *sites)
repeating.start(60)
reactor.run()
It starts in the everyMinute() function, which runs every 60 seconds. Within that function, each endpoint is queried and once the contents of the response becomes available, the treq.content function takes the response and returns the contents. Finally the contents are written to a file.
PS
Are you scraping or trying to extract something from those sites? If you are scrapy might be a good option for you.
I have A simple Client which sends a request to server and receives a response :
from StringIO import StringIO
from twisted.internet import reactor
from twisted.internet.protocol import Protocol
from twisted.web.client import Agent
from twisted.web.http_headers import Headers
from twisted.internet.defer import Deferred
from twisted.web.client import FileBodyProducer
import log , time
class server_response(Protocol):
def __init__(self, finished):
self.finished = finished
self.remaining = 1024 * 10
def dataReceived(self, bytes):
if self.remaining:
reply = bytes[:self.remaining]
print "reply from server:" , reply
log.info(reply)
def connectionLost(self, reason):
#print 'Finished receiving body:', reason.getErrorMessage()
self.finished.callback(None)
def capture_response(response):
finished = Deferred()
response.deliverBody(server_response(finished))
return finished
def cl():
xml_str = "<xml>"
agent = Agent(reactor)
body = FileBodyProducer(StringIO(xml_str))
d = agent.request(
'PUT',
'http://localhost:8080/',
Headers({'User-Agent': ['Replication'],
'Content-Type': ['text/x-greeting']}),
body)
d.addCallback(capture_response)
def cbShutdown(ignored):
reactor.stop()
d.addBoth(cbShutdown)
reactor.run()
if __name__ == '__main__':
count = 1
while (count < 5) :
print count
cl()
time.sleep(2)
count = count + 1
here in main, i am trying to send the request to server by invoking cl() 5 times in a while loop . but i am receiving some error, what i am assuming is that i have not stopped the client hence reactor is not starting, how do i solve this problem
Unfortunately, the Twisted reactor cannot be restarted. Once you have done reactor.stop() you cannot do reactor.start() again.
Instead you need to do something like chaining the runs so that the callback for one run finishing will cause the next run to be started, or then schedule the runs with reactor.callLater().
I read up about threading in the IBM developer sources and found the following example.
In general I understand what happens here, except for one important thing. The work seems to be done in the run() function. In this example run() only prints a line and signals to the queue, that the job is done.
What if I had to return some processed data? I thought about caching it in a global variable, and to access this one later, but this seems not the right way to go.
Any advice?
Perhaps I should clearify: My intuition tells me to add return processed_data to run() right after self.queue.task_done(), but I can't figure out where to catch that return, since it is not obvious to me where run() is called.
#!/usr/bin/env python
import Queue
import threading
import urllib2
import time
hosts = ["http://yahoo.com", "http://google.com", "http://amazon.com",
"http://ibm.com", "http://apple.com"]
queue = Queue.Queue()
class ThreadUrl(threading.Thread):
"""Threaded Url Grab"""
def __init__(self, queue):
threading.Thread.__init__(self)
self.queue = queue
def run(self):
while True:
#grabs host from queue
host = self.queue.get()
#grabs urls of hosts and prints first 1024 bytes of page
url = urllib2.urlopen(host)
print url.read(1024)
#signals to queue job is done
self.queue.task_done()
start = time.time()
def main():
#spawn a pool of threads, and pass them queue instance
for i in range(5):
t = ThreadUrl(queue)
t.setDaemon(True)
t.start()
#populate queue with data
for host in hosts:
queue.put(host)
#wait on the queue until everything has been processed
queue.join()
main()
print "Elapsed Time: %s" % (time.time() - start)
You can't return a value from run, and in any case there is normally more than one item to process in each thread, so you don't want to return at all after processing one value (see the while loop in each thread).
I would either use another queue to return the results:
queue = Queue.Queue()
out_queue = Queue.Queue()
class ThreadUrl(threading.Thread):
...
def run(self):
while True:
#grabs host from queue
host = self.queue.get()
#grabs urls of hosts and saves first 1024 bytes of page
url = urllib2.urlopen(host)
out_queue.put(url.read(1024))
#signals to queue job is done
self.queue.task_done()
...
def main():
...
#populate queue with data
for host in hosts:
queue.put(host)
#don't have to wait until everything has been processed if we don't want to
for _ in range(len(hosts)):
first_1k = out_queue.get()
print first_1k
or store the result in the same queue:
class WorkItem(object):
def __init__(self, host):
self.host = host
class ThreadUrl(threading.Thread):
...
def run(self):
while True:
#grabs host from queue
work_item = self.queue.get()
host = work_item.host
#grabs urls of hosts and saves first 1024 bytes of page
url = urllib2.urlopen(host)
work_item.first_1k = url.read(1024)
#signals to queue job is done
self.queue.task_done()
...
def main():
...
#populate queue with data
work_items = [WorkItem(host) for host in hosts]
for item in work_items:
queue.put(item)
#wait on the queue until everything has been processed
queue.join()
for item in work_items:
print item.first_1k
the problem with using the queue method is : the order in which the threads may complete is random . Hence the queue item may not necessarily reflect the result of that specific position .
In this example , if google.com gets done before yahoo.com , then the queue has google data before yahoo data, so when retrieving it , the results are incorrect.