import threading
import urllib2
import time
import webapp2
import main
# Timestamp taken at module import, used as the reference point for the
# elapsed-time messages below.
# NOTE(review): this is process start, not request start, so every request
# reports time since the module was first loaded -- confirm that is intended.
start = time.time()
# Base endpoint; each phone number is appended as the query-string value.
# NOTE(review): "exmple.com" looks like a typo for "example.com" -- confirm.
url = "http://exmple.com?phone="
class BatchSuscriber(webapp2.RequestHandler):
    """Fetch one URL per submitted phone number using worker threads.

    GET renders the batch-entry form; POST splits the comma-separated
    ``numbers`` field, fetches ``url + phone`` for each number in its own
    thread, and reports every page body plus per-fetch and total timings.

    NOTE(review): the class name looks like a typo for "BatchSubscriber",
    but renaming it would break any route table that references it.
    """

    def get(self):
        """Render the batch submission form."""
        template = main.JINJA_ENVIRONMENT.get_template('batch.html')
        self.response.out.write(template.render())

    def post(self):
        """Fan out one fetch thread per phone number, then write the results."""
        address = self.request.get('address')  # NOTE(review): currently unused
        numbers = str(self.request.get('numbers')).split(',')
        # BUG FIX: time each request from its own start, not from module
        # import time -- the old module-level `start` made every reported
        # duration grow with the age of the process.
        self._start = time.time()
        # BUG FIX: webapp2's response object is not safe to write from
        # worker threads; workers collect output here and the request
        # thread emits it after join().
        self._results = []
        threads = [threading.Thread(target=self.fetch_url, args=(phone,))
                   for phone in numbers]
        for thread in threads:
            thread.start()
        for thread in threads:
            thread.join()
        for chunk in self._results:
            self.response.write(chunk)
            self.response.write("<br>")
        self.response.write("Elapsed Time: %s" % (time.time() - self._start))
        self.response.write("<br>")

    def fetch_url(self, phone):
        """Worker: download url+phone and record the page and its timing."""
        urlHandler = urllib2.urlopen(url + phone)
        try:
            html = urlHandler.read()
        finally:
            urlHandler.close()  # BUG FIX: don't leak the connection
        # list.append is atomic under the GIL, so no extra locking is needed.
        self._results.append(html)
        self._results.append("'%s\' fetched in %ss" % (url + phone, (time.time() - self._start)))
I am trying to use the above code to make urlfetch calls asynchronously. From my log, it seems the calls are actually made serially instead of in parallel. What ways can I achieve this in GAE? Thanks.
Trying to use threads is entirely the wrong approach here. GAE already includes an asynchronous requests service in google.appengine.api.urlfetch; you should use that.
Related
Currently, I got some little Python Script running, creating some Web-Requests.
I am absolute new to Python, so I took a bare-bones Script I found, and it uses Multi-Threads (see end of thread for the full Script):
if __name__ == '__main__':
    # Spawn THREAD_COUNT request loops, then block until every one finishes.
    workers = [Thread(target=callback) for _ in range(THREAD_COUNT)]
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()
However, I feel this script is kind of slow, as if it performs the requests one after another and not at the same time.
So I took another approach and tried to find more about Workers and Multi-Threads.
It seems "Workers" are the Way to go, instead of Threads?
So I took the following from a Tutorial and modified it a little:
import logging
import os
from queue import Queue
from threading import Thread
from time import time
from multi import callback
# Root logging config: timestamped INFO-level messages for the whole process.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
# Module-level logger named after this module, per the logging convention.
logger = logging.getLogger(__name__)
class DownloadWorker(Thread):
    """Worker thread that forever pulls items off *queue* and runs callback().

    The queued value itself is ignored; every item simply triggers one
    callback() invocation, and task_done() is always signalled so that
    queue.join() can eventually complete.
    """

    def __init__(self, queue):
        super().__init__()
        self.queue = queue

    def run(self):
        while True:
            try:
                # One queued item == one web request (callback lives in multi.py).
                callback()
            finally:
                self.queue.task_done()
if __name__ == '__main__':
    started = time()
    jobs = Queue()
    # Eight daemon workers drain the queue concurrently; daemon=True lets
    # the process exit without explicitly joining them.
    for _ in range(8):
        w = DownloadWorker(jobs)
        w.daemon = True
        w.start()
    # Enqueue 500,000 items; the payload (the counter) is ignored by the
    # workers -- each item just triggers one callback() run.
    for job_id in range(500000):
        logger.info('Queueing')
        jobs.put(job_id)
    jobs.join()  # block until task_done() has been called for every item
    logging.info('Took %s', time() - started)
I am not sure here, if that is the correct approach, from my Understanding I created 8 Workers and with the queue.put(i). I give them Jobs (500,000 in this Case?) passing them the current counter (which does nothing, it seems to be required tho?)
After it is done queueing, the function is executed, as I can see in my console.
However, I feel it still runs same slow as before?
(My Original Request File)
from threading import Thread
import requests
import json
import string
import urllib3
import threading
# Number of request threads to run concurrently.
THREAD_COUNT = 5
# Requests are made with verify=False, so silence the per-request TLS warning.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
def callback():
    """Hammer the API in an endless loop, printing each response's status.

    Runs until Ctrl-C; every iteration POSTs the same JSON ping and prints
    OK / error info together with this thread's identifier and a counter.
    """
    counter = 0
    try:
        while True:
            print("Prozess " + str(threading.get_ident()) + " " + str(counter))
            counter += 1
            response = requests.post('ourAPIHere', verify=False, json={"pingme": "hello"})
            status = json.loads(response.text)["status"]
            if status == "error":
                print("Server Error? Check logs!")
            if status == "success":
                print("OK")
    except KeyboardInterrupt:
        return
if __name__ == '__main__':
    # Launch THREAD_COUNT request loops in parallel and wait on all of them.
    workers = [Thread(target=callback) for _ in range(THREAD_COUNT)]
    for w in workers:
        w.start()
    for w in workers:
        w.join()
I have the problem that I need to write values generated by a consumer to disk. I do not want to open a new instance of a file to write every time, so I thought I would use a second queue and another consumer to write to disk from a single greenlet. The problem with my code is that the second queue does not get consumed asynchronously alongside the first queue. The first queue finishes first and then the second queue gets consumed.
I want to write values to disk at the same time then other values get generated.
Thanks for help!
#!/usr/bin/python
#- * -coding: utf-8 - * -
import gevent #pip install gevent
from gevent.queue import *
import gevent.monkey
from timeit import default_timer as timer
from time import sleep
import cPickle as pickle
gevent.monkey.patch_all()
def save_lineCount(count):
    """Persist *count* to the pickle file count.p, replacing any old value."""
    with open("count.p", "wb") as out:
        pickle.dump(count, out)
def loader():
    """Seed the shared work queue with the integers 0, 1, 2."""
    for value in range(0, 3):
        q.put(value)
def writer():
while True:
task = q_w.get()
print "writing",task
save_lineCount(task)
def worker():
while not q.empty():
task = q.get()
if task%2:
q_w.put(task)
print "put",task
sleep(10)
def asynchronous():
threads = []
threads.append(gevent.spawn(writer))
for i in range(0, 1):
threads.append(gevent.spawn(worker))
start = timer()
gevent.joinall(threads,raise_error=True)
end = timer()
#pbar.close()
print "\n\nTime passed: " + str(end - start)[:6]
# Shared queues: q feeds the workers, q_w feeds the single disk writer.
q = gevent.queue.Queue()
q_w = gevent.queue.Queue()
# Fill q completely before any worker starts -- loader runs to completion
# here because of the immediate join().
gevent.spawn(loader).join()
asynchronous()
In general, that approach should work fine. There are some problems with this specific code, though:
Calling time.sleep will cause all greenlets to block. You either need to call gevent.sleep or monkey-patch the process in order to have just one greenlet block (I see gevent.monkey imported, but patch_all is not called). I suspect that's the major problem here.
Writing to a file is also synchronous and causes all greenlets to block. You can use FileObjectThread if that's a major bottleneck.
I have jobs scheduled thru apscheduler. I have 3 jobs so far, but soon will have many more. i'm looking for a way to scale my code.
Currently, each job is its own .py file, and in the file, I have turned the script into a function with run() as the function name. Here is my code.
from apscheduler.scheduler import Scheduler
import logging
import job1
import job2
import job3
logging.basicConfig()
# Single scheduler instance that owns every scheduled job.
sched = Scheduler()
# NOTE(review): the next line was probably the decorator
# "@sched.cron_schedule(day_of_week='mon-sun', hour=7)" before the paste
# mangled "@" into "#"; as written it is inert and runjobs is never
# scheduled -- confirm against the original.
#sched.cron_schedule(day_of_week='mon-sun', hour=7)
def runjobs():
    # Each job module exposes a run() entry point; they execute in order.
    job1.run()
    job2.run()
    job3.run()
sched.start()
This works, right now the code is just stupid, but it gets the job done. But when I have 50 jobs, the code will be stupid long. How do I scale it?
note: the actual names of the jobs are arbitrary and doesn't follow a pattern. The name of the file is scheduler.py and I run it using execfile('scheduler.py') in python shell.
import urllib
import threading
import datetime
pages = ['http://google.com', 'http://yahoo.com', 'http://msn.com']
#------------------------------------------------------------------------------
# Getting the pages WITHOUT threads
#------------------------------------------------------------------------------
def job(url):
    """Download *url* and read the whole body; the result is discarded."""
    html = urllib.urlopen(url).read()
def runjobs():
    """Fetch every page one after another (the serial baseline)."""
    for url in pages:
        job(url)
# Time the serial version: total wall time for all pages, back to back.
start = datetime.datetime.now()
runjobs()
end = datetime.datetime.now()
# NOTE(review): .microseconds is only the sub-second component of the
# timedelta; for runs longer than a second .total_seconds() is needed.
print "jobs run in {} microseconds WITHOUT threads" \
    .format((end - start).microseconds)
#------------------------------------------------------------------------------
# Getting the pages WITH threads
#------------------------------------------------------------------------------
def job(url):
    """Fetch *url* completely; the body is read only for the timing demo."""
    handle = urllib.urlopen(url)
    html = handle.read()
def runjobs():
    """Fetch every page concurrently -- one thread per URL -- then wait."""
    workers = []
    for url in pages:
        w = threading.Thread(target=job, args=(url,))
        w.start()
        workers.append(w)
    for w in workers:
        w.join()
# Time the threaded version over the same page list for comparison.
start = datetime.datetime.now()
runjobs()
end = datetime.datetime.now()
# NOTE(review): same caveat as above -- .microseconds drops whole seconds.
print "jobs run in {} microsecond WITH threads" \
    .format((end - start).microseconds)
Look #
http://furius.ca/pubcode/pub/conf/bin/python-recursive-import-test
This will help you import all python / .py files.
While importing, you can create a list which keeps a reference to each function call, for example.
[job1.run(),job2.run()]
Then iterate through them and call function :)
Thanks Arjun
I am trying to develop a downloader app in pygtk
So when a user adds a url following actions happen
addUrl()
which calls
validateUrl()
getUrldetails()
So it took a little while to add the url to the list because of urllib.urlopen delay
so i tried to implement threads. I added the following code to main window
thread.start_new_thread(addUrl, (self,url, ))
I passed a reference to the main window so that i can access the list from thread
but nothing seems to happen
I suggest that you check this thread first: How to use threading in Python?
for example:
import Queue
import threading
import urllib2
def get_url(q, url):
    """Thread target: fetch *url* and push the response body onto queue *q*."""
    body = urllib2.urlopen(url).read()
    q.put(body)
# Two demo URLs, split on whitespace.
theurls = '''http://google.com http://yahoo.com'''.split()
# Shared queue collecting one page body per worker thread.
q = Queue.Queue()
for u in theurls:
    t = threading.Thread(target=get_url, args = (q,u))
    t.daemon = True  # don't keep the process alive waiting for stragglers
    t.start()
# Blocks until the FIRST worker delivers; the other result is never read.
s = q.get()
print s
Hope this helps you.
I need to do a blocking xmlrpc call from my python script to several physical server simultaneously and perform actions based on response from each server independently.
To explain in detail let us assume following pseudo code
# Pseudocode from the question: poll one server in a blocking loop.
while True:
    response=call_to_server1() #blocking and takes very long time
    if response==this:
        do that  # placeholder action -- not valid Python
I want to do this for all the servers simultaneously and independently but from same script
Use the threading module.
Boilerplate threading code (I can tailor this if you give me a little more detail on what you are trying to accomplish)
def run_me(func):
    # Repeatedly call *func* until the main thread sets stop_event.
    while not stop_event.isSet():
        response= func() #blocking and takes very long time
        if response==this:
            do that  # pseudocode placeholder -- replace with real handling
def call_to_server1():
    """Blocking call to server 1; returns whatever the server answered."""
    return magic_server1_call()
def call_to_server2():
    """Blocking call to server 2; returns whatever the server answered."""
    return magic_server2_call()
# Event used to stop both polling loops from the main thread.
stop_event = threading.Event()

# BUG FIX: args must be a tuple. "(call_to_server1)" is just a parenthesized
# name, so Thread would try to iterate the function itself as the argument
# list and crash when the thread starts; the trailing comma makes a 1-tuple.
t = threading.Thread(target=run_me, args=(call_to_server1,))
t.start()
t2 = threading.Thread(target=run_me, args=(call_to_server2,))
t2.start()
# Wait for both threads to return (they exit once stop_event is set).
t.join()
t2.join()
# We are done.
You can use multiprocessing module
import multiprocessing
def call_to_server(ip,port):
    # Placeholder body from the answer -- replace with the real RPC to ip:port.
    ....
    ....
# BUG FIX: the list must exist before .append() is called on it.
process = []
# One subprocess per server, started as soon as it is created.
for i in xrange(server_count):
    process.append(multiprocessing.Process(target=call_to_server, args=(ip, port)))
    process[i].start()
# Wait for every subprocess to stop.
for p in process:
    p.join()
You can use multiprocessing plus queues. With one single sub-process this is the example:
import multiprocessing
import time
def processWorker(input, result):
    """Consume items from *input* until a 'STOP' sentinel arrives.

    Each non-sentinel item is handed to the (stubbed) remote request and
    its outcome is pushed onto *result*. Runs in a child process.
    """
    def remoteRequest(params):
        # Stand-in for the real remote call.
        return True

    while True:
        job = input.get()
        if 'STOP' in job:
            return
        result.put(remoteRequest(job))
# Channels shared with the worker process.
input = multiprocessing.Queue()   # NOTE: shadows the builtin "input"
result = multiprocessing.Queue()
p = multiprocessing.Process(target = processWorker, args = (input, result))
p.start()
# Submit every request, then read back exactly one reply per request
# (replies arrive in completion order, not submission order).
requestlist = ['1', '2']
for req in requestlist:
    input.put(req)
for i in xrange(len(requestlist)):
    res = result.get(block = True)
    print 'retrieved ', res
# Sentinel telling the worker loop to exit, then give it time to do so.
input.put('STOP')
time.sleep(1)
print 'done'
To have more the one sub-process simply use a list object to store all the sub-processes you start.
The multiprocessing queue is a safe object.
Then you can keep track of which request is being executed by each sub-process simply by storing the request associated with a work id (the work id can be a counter incremented when the queue is filled with new work). Usage of multiprocessing.Queue is robust since you do not need to rely on stdout/stderr parsing and you also avoid the related limitations.
Then, you can also set a timeout on how long you want a get call to wait at max, eg:
import Queue
try:
    # Give up if no result arrives within 10 seconds.
    res = result.get(block = True, timeout = 10)
except Queue.Empty:
    print error  # placeholder: "error" is not defined in this snippet
Use twisted.
It has a lot of useful stuff for work with network. It is also very good at working asynchronously.