Subprocess with 'While True' ends after 3640 iterations - python

I have a Django app that spawns a subprocess every time there is a database insert.
models.py
# spawn subprocess to trigger tweepy
# output of subprocess DOES NOT log to the console.
def tweepy_tester(sender, **kwargs):
    """post_save receiver that launches tests.py in a child interpreter.

    Does nothing unless the signal reports a newly created row
    (``kwargs['created']`` is truthy). The child is started with
    ``subprocess.Popen`` and is not waited on, so this handler returns as
    soon as the process has been spawned.

    :param sender: the model class that sent the signal (unused here)
    :param kwargs: signal keyword arguments; only ``created`` is read
    """
    if not kwargs['created']:
        return
    logger.error('tweepy trigger-start!')
    script_path = "/Users/viseshprasad/PycharmProjects/Blood_e_Merry/loginsignup/tests.py"
    # stderr is merged into stdout, and stdout goes to a pipe that this
    # handler never reads, so nothing the child prints reaches the console.
    child = subprocess.Popen(
        [sys.executable, script_path],
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
    )
    logger.error('tweepy trigger-over!')
# use post_save to trigger tweepy later
post_save.connect(tweepy_tester, sender=User)
tests.py
logger = logging.getLogger(__name__)
# Create your tests here.
def for_thread(path='test.txt', iterations=None):
    """Append numbered 'Tweepy triggered <i>' lines to a text file.

    The file is reopened in append mode for every line, so each line is
    flushed and closed before the next one is written.

    :param path: file to append to; defaults to 'test.txt' in the current
        working directory
    :param iterations: number of lines to write; ``None`` (the default)
        keeps writing forever
    """
    i = 0
    while iterations is None or i < iterations:
        # `with` guarantees the handle is closed even if write() raises.
        with open(path, 'a') as f:
            f.write('Tweepy triggered ' + str(i) + '\n')
        i += 1
for_thread()
The trigger happens fine, but the subprocess only writes 3640 lines to the test.txt file, even though I have used while True:
I am basically looking for a subprocess that runs non-stop after the trigger, as a separate thread, without disturbing the main thread.
The purpose :
I run my app with the usual python manage.py runserver.
User signs-up -> database insert -> triggers my implementation of tweepy which keeps on streaming tweets and analyzing them non-stop on a different background thread so as to not interfere with the signup process.
The above test is for this purpose.
Any help is appreciated. Any alternative suggestions to implement this are also welcome.
Thanks.

Related

Python multiprocessing module do not work

I am trying to write a spider with the multiprocessing module.
here is my python code:
# -*- coding:utf-8 -*-
import multiprocessing
import requests
class SpiderWorker(object):
    """Queue consumer: takes items off a queue and crawls for each one."""

    def __init__(self, q):
        # q: queue the worker blocks on; one fetch is made per item received.
        self._q = q

    def run(self):
        """Loop forever, performing one crawl per item taken from the queue."""
        def _crawl_item(url):
            # Bind the response: the original discarded it and then read an
            # undefined name `respon`, raising NameError on the first item.
            # NOTE(review): the target is hard-coded and `url` is ignored,
            # as in the original -- confirm whether `url` should be fetched.
            respon = requests.get("http://www.baidu.com")
            if respon.ok:
                # Parenthesised single-argument print works on Python 2 and 3.
                print(respon.url)

        while True:
            rst = self._q.get()
            _crawl_item(rst)
def general_worker():
    """Start one SpiderWorker process per CPU, all sharing a single queue.

    :returns: ``(q, worker_processes)`` -- the shared multiprocessing.Queue
        and the list of started Process objects
    """
    q = multiprocessing.Queue()
    CPU_COUNT = multiprocessing.cpu_count()
    worker_processes = [
        multiprocessing.Process(target=SpiderWorker(q).run)
        for i in range(CPU_COUNT)
    ]
    # Plain loop instead of map(): on Python 3 map() is lazy, so the
    # original map(lambda ...) never started any process.
    for process in worker_processes:
        process.start()
    return q, worker_processes
Maybe the way I am handling the processes is wrong.
Every time I run this code, the process reports:
<Process(Process-1, stopped[SIGSEGV])>
hope love it
The major problem here is that you don't have any information on why your processes fail. It could be gevent, but it could just as easily be something else. So learning the actual reason why your processes get terminated is the first step before doing anything else.
What you need is multiprocessing.log_to_stderr():
class SpiderWorker(object):
    # ...
    def run(self):
        """Run the worker body with multiprocessing's stderr logger enabled.

        Any exception raised by the body is logged with its full traceback
        instead of letting the process die silently.
        """
        logger = multiprocessing.log_to_stderr()
        logger.setLevel(multiprocessing.SUBDEBUG)
        try:
            # Here goes your original run() code
            pass  # placeholder so the template is valid Python until filled in
        except Exception:
            logger.exception('whoopsie')
What this code does:
Creates a special logger which will transmit its information to the main process and dump it to stderr (console by default).
Configures this logger to report everything, including some internal multiprocessing module events (just in case as you probably don't need them).
Wraps your entire code in catch-all statement so whatever happens there cannot escape your notice.
Runs .exception() method on the logger, which not only logs the message (it's meaningless anyway as we don't know what actually happens) but most importantly logs the entire error traceback - which we actually need.

Using Popen in a thread blocks every incoming Flask-SocketIO request

I have the following situation:
I receive a request on a socketio server. I answer it (socket.emit(..)) and then start something with heavy computation load in another thread.
If the heavy computation is caused by subprocess.Popen (using subprocess.PIPE) it totally blocks every incoming request as long as it is being executed although it happens in a separate thread.
No problem - in this thread it was suggested to asynchronously read the result of the subprocess with a buffer size of 1 so that between these reads other threads have the chance to do something. Unfortunately this did not help for me.
I also already monkeypatched eventlet and that works fine - as long as I don't use subprocess.Popen with subprocess.PIPE in the thread.
In this code sample you can see that it only happens using subprocess.Popen with subprocess.PIPE. When uncommenting #functionWithSimulatedHeavyLoad() and instead comment functionWithHeavyLoad() everything works like charm.
from flask import Flask
from flask.ext.socketio import SocketIO, emit
import eventlet
eventlet.monkey_patch()
app = Flask(__name__)
socketio = SocketIO(app)
import time
from threading import Thread
# The decorator registers this function as the handler for the
# 'client command' event; it had been turned into a comment, which left
# the handler unregistered.
@socketio.on('client command')
def response(data, type = None, nonce = None):
    """Answer the client immediately, then start the heavy work in a thread.

    :param data: payload sent by the client (not used here)
    :param type: optional value supplied by the client (not used here)
    :param nonce: optional value supplied by the client (not used here)
    """
    socketio.emit('client response', ['foo'])
    thread = Thread(target = testThreadFunction)
    # Daemon thread: it must not keep the server process alive on shutdown.
    thread.daemon = True
    thread.start()
def testThreadFunction():
    """Thread target: run whichever workload is currently enabled below."""
    # Swap the two calls to compare the simulated load with the real one.
    #functionWithSimulatedHeavyLoad()
    functionWithHeavyLoad()
def functionWithSimulatedHeavyLoad():
    """Stand-in for heavy work: sleep for five seconds."""
    time.sleep(5)
def functionWithHeavyLoad():
    """Run a slow shell command, collect its output and emit it to the client.

    The child's stdout is read by a helper thread into a queue. That thread
    is joined straight away, so this function waits until the command has
    finished before draining the queue and emitting the text.
    """
    from datetime import datetime
    import subprocess
    import sys
    from queue import Queue, Empty

    ON_POSIX = 'posix' in sys.builtin_module_names

    def enqueueOutput(out, queue):
        # Forward each line from the pipe to the queue, then close the pipe.
        for line in iter(out.readline, b''):
            if line == '':
                break
            queue.put(line)
        out.close()

    # just anything that takes long to be computed
    shellCommand = 'find / test'
    child = subprocess.Popen(
        shellCommand,
        universal_newlines=True,
        shell=True,
        stdout=subprocess.PIPE,
        bufsize=1,
        close_fds=ON_POSIX,
    )
    pending = Queue()
    reader = Thread(target = enqueueOutput, args = (child.stdout, pending))
    reader.daemon = True
    reader.start()
    reader.join()

    # Drain everything the reader queued, echoing each line as we go.
    collected = []
    while True:
        try:
            line = pending.get_nowait()
        except Empty:
            break
        collected.append(line)
        print(line)
    text = ''.join(collected)
    socketio.emit('client response', {'text': text})
socketio.run(app)
The client receives the message 'foo' after the blocking work in the functionWithHeavyLoad() function is completed. It should receive the message earlier, though.
This sample can be copied and pasted in a .py file and the behavior can be instantly reproduced.
I am using Python 3.4.3, Flask 0.10.1, flask-socketio1.2, eventlet 0.17.4
Update
If I put this into the functionWithHeavyLoad function it actually works and everything's fine:
import shlex
shellCommand = shlex.split('find / test')
popen = subprocess.Popen(shellCommand, stdout=subprocess.PIPE)
lines_iterator = iter(popen.stdout.readline, b"")
for line in lines_iterator:
print(line)
eventlet.sleep()
The problem is: I used find for heavy load in order to make the sample more easily reproducible for you. However, in my code I actually use tesseract "{0}" stdout -l deu as the shell command. This (unlike find) still blocks everything. Is this a tesseract issue rather than an eventlet one? But still: how can this block if it happens in a separate thread, where it reads line by line with a context switch, when find does not block?
Thanks to this question I learned something new today. Eventlet does offer a greenlet friendly version of subprocess and its functions, but for some odd reason it does not monkey patch this module in the standard library.
Link to the eventlet implementation of subprocess: https://github.com/eventlet/eventlet/blob/master/eventlet/green/subprocess.py
Looking at the eventlet patcher, the modules that are patched are os, select, socket, thread, time, MySQLdb, builtins and psycopg2. There is absolutely no reference to subprocess in the patcher.
The good news is that I was able to work with Popen() in an application very similar to yours, after I replaced:
import subprocess
with:
from eventlet.green import subprocess
But note that the currently released version of eventlet (0.17.4) does not support the universal_newlines option in Popen, you will get an error if you use it. Support for this option is in master (here is the commit that added the option). You will either have to remove that option from your call, or else install the master branch of eventlet direct from github.

Python Django Asynchronous Request handling

I am working on an application that does a huge amount of data processing to generate a completely new set of data, which is then finally saved to the database. The application takes a very long time to process the data and save it to the database. I want to improve the user experience to some extent by redirecting the user to the result page first and then doing the data saving in the background (maybe asynchronously). My problem is that, to display the result page, I need to have the new set of processed data. Is there any way to do the data processing and data saving in the background so that, whenever the data processing part is completed (before saving to the database), I get the processed data on the result page?
Asynchronous tasks can be accomplished in Python using Celery. You can simply push the task to Celery queue and the task will be performed in an asynchronous way. You can then do some polling from the result page to check if it is completed.
Other alternative can be something like Tornado.
Another strategy is to write a threading class that starts up custom management commands you author to behave as worker threads. This is perhaps a little lighter weight than working with something like celery, and of course has both advantages and disadvantages. I also used this technique to sequence/automate migration generation/application during application startup (because it lives in a pipeline). My gunicorn startup script then starts these threads in pre_exec() or when_ready(), etc., as appropriate, and then stops them in on_exit().
# Description: Asynchronous Worker Threading via Django Management Commands
# Lets you run an arbitrary Django management command, either a pre-baked one like migrate,
# or a custom one that you've created, as a worker thread, that can spin forever, or not.
# You can use this to take care of maintenance tasks at start-time, like db migration,
# db flushing, etc, or to run long-running asynchronous tasks.
# I sometimes find this to be a more useful pattern than using something like django-celery,
# as I can debug/use the commands I write from the shell as well, for administrative purposes.
import json
import os
import requests
import sys
import time
import uuid
import logging
import threading
import inspect
import ctypes
from django.core.management import call_command
from django.conf import settings
class DjangoWorkerThread(threading.Thread):
    """
    Initializes a separate thread for running an arbitrary Django management command. This is
    one (simple) way to make asynchronous worker threads. There exist richer, more complex
    ways of doing this in Django as well (django-celery).
    The advantage of this pattern is that you can run the worker from the command line as well,
    via manage.py, for the sake of rapid development, easy testing, debugging, management, etc.
    :param commandname: name of a properly created Django management command, which exists
    inside the app/management/commands folder in one of the apps in your project.
    :param arguments: string containing command line arguments formatted like you would
    when calling the management command via manage.py in a shell
    :param restartwait: integer seconds to wait before restarting worker if it dies,
    or if a once-through command, acts as a thread-loop delay timer
    :param logger: logger to report to; the 'root' logger is used when omitted
    """
    def __init__(self, commandname,arguments="",restartwait=10,logger=""):
        super(DjangoWorkerThread, self).__init__()
        self.commandname = commandname
        self.arguments = arguments
        self.restartwait = restartwait
        self.name = commandname
        # Set by stop() (or by run() after too many failures) to end the loop.
        self.event = threading.Event()
        if logger:
            self.l = logger
        else:
            self.l = logging.getLogger('root')

    def run(self):
        """
        Run the management command repeatedly until the stop event is set.

        After each run -- successful or failed -- the thread waits up to
        ``restartwait`` seconds on the stop event before the next run. Once
        more than ten runs have raised, the thread gives up and stops itself.
        """
        # Local import keeps this class self-contained with respect to the
        # file's existing import block.
        import shlex
        try:
            exceptioncount = 0
            exceptionlimit = 10
            # The arguments are documented as a shell-style string, so split
            # them into separate tokens; passing the whole string as a single
            # positional argument makes the command's parser see one token.
            argv = shlex.split(self.arguments) if self.arguments else []
            while not self.event.is_set():
                try:
                    if argv:
                        self.l.info('Starting ' + self.name + ' worker thread with arguments ' + self.arguments)
                        call_command(self.commandname, *argv)
                    else:
                        self.l.info('Starting ' + self.name + ' worker thread with no arguments')
                        call_command(self.commandname)
                except Exception as e:
                    self.l.error(self.commandname + ' Unkown error: {}'.format(str(e)))
                    exceptioncount += 1
                    if exceptioncount > exceptionlimit:
                        self.l.error(self.commandname + " : " + self.arguments + " : Exceeded exception retry limit, aborting.")
                        self.event.set()
                # Wait here rather than inside the try block, so a failing
                # command is also delayed before its restart instead of being
                # retried in a tight loop. Returns at once if the event is set.
                self.event.wait(self.restartwait)
        finally:
            self.l.info('Stopping command: ' + self.commandname + " " + self.arguments)

    def stop(self, grace=0.1):
        """Nice Stop
        Stop nicely by setting an event, give the thread ``grace`` seconds to
        finish, and terminate the whole process with os._exit(1) if it is
        still running after that.

        :param grace: seconds to wait before re-checking whether the thread
            has exited
        """
        self.l.info("Sending stop event to self...")
        self.event.set()
        #then make sure it's dead...and schwack it harder if not.
        #kill it with fire! be mean to your software. it will make you write better code.
        self.l.info("Sent stop event, checking to see if thread died.")
        # is_alive() replaces isAlive(), which was removed in Python 3.9.
        if self.is_alive():
            # Let the thread notice the event before deciding it is stuck;
            # the original checked before sleeping and never re-checked, so a
            # thread that was about to exit still took the process down.
            time.sleep(grace)
        if self.is_alive():
            self.l.info("Still not dead, telling self to murder self...")
            os._exit(1)
def start_worker(command_name, command_arguments="", restart_wait=10,logger=""):
    """
    Create and start a DjangoWorkerThread for a management command.
    :param str command_name: the name of the Django management command to run,
    typically a custom command implemented in yourapp/management/commands,
    but could also be a standard Django management task
    :param str command_arguments: a string containing the command line arguments
    to supply to the management command, formatted as if one were invoking
    the command from a shell
    :param restart_wait: value handed to the thread as its restart delay
    :param logger: logger to report to; the 'root' logger is used when omitted
    :returns: the started thread instance
    """
    l = logger if logger else logging.getLogger('root')
    # Same "name : arguments : wait" summary is logged before and after the start.
    summary = command_name + " : " + command_arguments + " : " + str(restart_wait)
    l.info("Starting worker: " + summary)
    worker = DjangoWorkerThread(command_name, command_arguments, restart_wait, l)
    worker.start()
    l.info("Worker started: " + summary)
    return worker
#<----------------------------------------------------------------------------->
def stop_worker(worker,logger=""):
    """
    Ask the worker thread to stop, then wait for it to finish.
    :param threading.Thread worker: the worker thread object
    :param logger: logger to report to; the 'root' logger is used when omitted
    """
    l = logger if logger else logging.getLogger('root')
    # Same "name : arguments : wait" summary is logged before and after the stop.
    summary = worker.commandname + " : " + worker.arguments + " : " + str(worker.restartwait)
    l.info("Stopping worker: " + summary)
    worker.stop()
    # Wait at most one restart interval for the thread to exit.
    worker.join(worker.restartwait)
    l.info("Worker stopped: " + summary)
The long running task can be offloaded with Celery. You can still get all the updates and results. Your web application code should take care of polling for updates and results. http://blog.miguelgrinberg.com/post/using-celery-with-flask
explains how one can achieve this.
Some useful steps:
Configure celery with result back-end.
Execute the long running task asynchronously.
Let the task update its state periodically or when it executes some stage in job.
Poll from web application to get the status/result.
Display the results on UI.
There is a need for bootstrapping it all together, but once done it can be reused and it is fairly performant.
It's the same process as for a synchronous request. You will use a View that should return a JsonResponse. The 'tricky' part is on the client side, where you have to make the async call to the view.

Capturing stdout from subprocess after sending SIGINT

I have a dtrace snippet run via python script and the dtrace snippet is such that it generates data when CTRL-C is issued to it. So I had a signal_handler defined in the python script to catch CTRL-C from user and relay this to the dtrace invocation done via subprocess.Popen but I am unable to get any output in my log file. Here is the script:
# Popen handle of the running dtrace child; None until it has been started.
Proc = None
# Becomes True once a Ctrl-C has been received.
signal_posted = False


def signal_handler(sig, frame):
    """SIGINT handler: record the Ctrl-C and relay SIGINT to the child.

    :param sig: signal number delivered to the handler
    :param frame: interrupted stack frame (unused)
    """
    global signal_posted
    global Proc
    print("Got CTRL-C!")
    signal_posted = True
    # The handler is installed before the child is started, so a Ctrl-C can
    # arrive while there is nothing to signal yet; the original then called
    # send_signal on an empty list and raised AttributeError.
    if Proc is not None:
        Proc.send_signal(signal.SIGINT) #Signal posting from handler
def execute_hotkernel():
    """Run dtrace with its stdout redirected to hotkernel.out and wait for it.

    The child handle is published through the module-level ``Proc`` so the
    SIGINT handler can signal it. The output file stays open until the
    child has exited and is then closed.
    """
    #
    # Generate the .out output file
    #
    fileout = "hotkernel.out"
    global Proc
    # `with` closes the file once dtrace has exited; the original never
    # closed the handle.
    with open(fileout, "w+") as fileo:
        Proc = subprocess.Popen(['/usr/sbin/dtrace', '-n', dtrace_script], stdout = fileo)
        # Poll with short sleeps so the Python-level signal handler gets a
        # chance to run while the child is alive.
        while Proc.poll() is None:
            time.sleep(0.5)
def main():
    """Install the Ctrl-C handler, then run the dtrace capture."""
    # Change our signal handler before the child is started.
    signal.signal(signal.SIGINT, signal_handler)
    execute_hotkernel()
if __name__ == '__main__':
main()
Since I have a file hotkernel.out set in subprocess.Popen command for stdout I was expecting the output from dtrace to be redirected to hotkernel.out on doing a CTRL-C but it is empty. What is missing here?
I have a similar issue.
In my case, it's a shell script that runs until you hit Control-C, and then prints out summary information. When I run this using subprocess.Popen, whether using a PIPE or a file object for stdout, I either don't get the information (with a file object) or it hangs when I try to run stdout.readline().
I finally tried running the subprocess from the interpreter and discovered I could get the last line of output after the SIGINT with a PIPE if I call stdout.readline() (where it hangs) and hit Control-C (in the interpreter), and then call stdout.readline() again.
I do not know how to emulate this in script, for a file output or for a PIPE. I did not try the file output in the interpreter.
EDIT:
I finally got back to this and determined, it's actually pretty easy to emulate outside of python and really has nothing to do with python.
/some_cmd_that_ends_on_sigint
(enter control-c)
*data from stdout in event handler*
Works
/some_cmd_that_ends_on_sigint | tee some.log
(enter control-c)
*Nothing sent to stdout in event handler prints to the screen or the log*
Where's my log?
I ended up just adding a file stream in the event handler (in the some_cmd_that_ends_on_sigint source) that writes the data to a (possibly secondary) log. Works, if a bit awkward. You get the data on the screen if running without any piping, but I can also read it when piped or from python from the secondary log.

Using a Python script to start and stop the Google App Engine dev_appserver during continuous integration testing

I'm trying to write a Python script that will enable me to start the Google App Engine dev_appserver using coverage.py, fetch the /test url from the app that I launch, wait for the server to finish returning the page, then shutdown the dev_appserver, and then generate a report.
My challenge is how to launch the dev_appserver in the background so that I can do the http fetch and then how to shut down the dev_appserver before generating my report.
I'm heading towards something like this:
# get_gae_coverage.py
# Launch dev_appserver with coverge.py
coverage run --source=./ /usr/local/bin/dev_appserver.py --clear_datastore --use_sqlite .
#Fetch /test
urllib.urlopen('http://localhost:8080/test').read()
# Shutdown dev_appserver somehow
# ??
# Generate coverage report
coverage report
What is the best way to write a python script to do this?
You should go with subprocess Popen
import os
import signal
import subprocess
import time
import urllib

# Start dev_appserver under coverage in the background, merging its stderr
# into the stdout pipe. `your_flag_list` is a placeholder for the list of
# remaining command-line arguments; it is concatenated so that argv stays
# a flat list of strings rather than a list nested inside a list.
coverage_proc = subprocess.Popen(
    ['coverage', 'run'] + your_flag_list,
    stdout=subprocess.PIPE,
    stderr=subprocess.STDOUT)
time.sleep(5) #Find the correct sleep value
urllib.urlopen('http://localhost:8080/test').read()
time.sleep(1)
# SIGINT rather than a hard kill, so the child gets the same signal a
# Ctrl-C would send.
os.kill(coverage_proc.pid, signal.SIGINT)
Here you can find another approach to test if the server is up and running:
# Read the server's output until it announces that the application is
# running. readline() returns an empty value at EOF, so also stop when the
# process has closed its output; otherwise this loop would spin forever.
line = proc.stdout.readline()
while line and '] Running application' not in line:
    line = proc.stdout.readline()
threading is the way to accomplish such a kind of task. Namely, you start the dev_appserver in a thread or in the main thread and as it is running, run and collect the results using the coverage module and then kill the dev_appserver python process in another thread and you will have results from coverage.
Here is a sample snippet which runs dev_appserver.py in a thread, waits for 10 seconds, and then kills the Python process. You can modify the end function so that, instead of waiting for 10 seconds, it waits for a few seconds (to let the Python process start), then runs the coverage testing, and once that is done kills the appserver and finishes coverage.
import threading
import subprocess
import time
hold_process = []
def start():
    """Launch dev_appserver.py and remember its process handle."""
    print('In the start process')
    appserver = subprocess.Popen(['/usr/bin/python','dev_appserver.py','yourapp'])
    # Stored in the shared list so that end() can find the process later.
    hold_process.append(appserver)
def end():
    """Wait ten seconds, then kill the app server started by start()."""
    time.sleep(10)
    appserver = hold_process.pop(0)
    print('Killing the appserver process')
    appserver.kill()
# One thread launches the app server, the other kills it after a delay.
t = threading.Thread(name='startprocess',target=start)
# The attribute is `daemon`; the original `deamon` only created an unused
# attribute and left the thread non-daemonic.
t.daemon = True
w = threading.Thread(name='endprocess',target=end)
t.start()
w.start()
# Wait for both threads so the script exits only after the kill has happened.
t.join()
w.join()

Categories