I have async socket server file and client file.
When i send something like that "download filename.ex" to the client, this client's code hande my request:
try:
content = read(sp_data[-1]).decode('utf-8')
print(content)
msg = json.dumps({'file': sp_data[-1], 'command': data, 'content': content,
'msg': f'[+] File {sp_data[-1]} has been successfully downloaded.'}).encode('utf-8')
except FileNotFoundError:
msg = json.dumps({'msg': f'[-] File {sp_data[-1]} not found', 'command': data}).encode('utf-8')
s.send(msg)
When client send some data to the socketserver, this server's code handle received message:
def recv_message(client_socket):
global messages
data = json.loads(client_socket.recv(4096).decode('utf-8').strip()) ##Important here i got this error json.decoder.JSONDecodeError: Unterminated string starting at: line 1 column 67 (char 66)
raddr = get_raddr(str(client_socket))
raddr = f'{raddr[0]}:{raddr[1]}'
message = f'From: {raddr}\nCommand: {data["command"]}\nOutput: \n\n{data["msg"]}'
try:
d = messages[raddr]
d.append(message)
messages[raddr] = d
except KeyError:
messages[raddr] = [message]
except AttributeError:
print(message, messages)
if 'content' in data.keys(): ##Important
print(data['content'])
threading.Thread(target=create_file, args=(data['file'], data['content'],), daemon=False).start()
Error:
data = json.loads(client_socket.recv(4096).decode('utf-8').strip())
json.decoder.JSONDecodeError: Unterminated string starting at: line 1 column 67 (char 66)
But server's code above give me this error when it receive message from the first code(when i send something like that "download file.ex" to the client, client detect my command as its special command, execute the first code, send json file to the server. But if i send "dir" command to the client, it will detect my command like shell command, will run command through subprocess, will send result to the server back and i won't get any errors.)
Note: I also reduced socketserver's code. Therefore, something in my code can work worse. The main goal of this post - make download feature works. I also understand that my code is big. I left "##Important" comments in my files. U can watch only code that located by these comments.
Server:
import selectors
import socket
import threading
import json
import base64
import shlex
selector = selectors.DefaultSelector()
connections = {}
def accept_conn(server_socket):
sock, addr = server_socket.accept()
connections[len(connections) + 1] = [sock, f'{addr[0]}:{addr[-1]}']
selector.register(fileobj=sock, events=selectors.EVENT_READ, data=recv_message)
s = socket.socket()
s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
s.bind(('localhost', 4444))
s.listen()
selector.register(fileobj=s, events=selectors.EVENT_READ, data=accept_conn)
messages = {}
##Important
def create_file(file, content): #content - base64 string
print(content)
with open(file, 'wb') as f:
f.write(base64.b64decode(content.encode('utf-8')))
def recv_message(client_socket):
global messages
data = json.loads(client_socket.recv(4096).decode('utf-8').strip()) ##Important here i got this error json.decoder.JSONDecodeError: Unterminated string starting at: line 1 column 67 (char 66)
raddr = get_raddr(str(client_socket))
raddr = f'{raddr[0]}:{raddr[1]}'
message = f'From: {raddr}\nCommand: {data["command"]}\nOutput: \n\n{data["msg"]}'
try:
d = messages[raddr]
d.append(message)
messages[raddr] = d
except KeyError:
messages[raddr] = [message]
except AttributeError:
print(message, messages)
if 'content' in data.keys(): ##Important
print(data['content'])
threading.Thread(target=create_file, args=(data['file'], data['content'],), daemon=False).start()
def get_raddr(string):
'''Get raddr parameter from client socket'''
raddr = string.replace('>', '')
return eval(raddr[raddr.find('raddr')::].replace('raddr=', ''))
def is_manage_string(sub, string):
tokens = shlex.split(string)
try:
if len(tokens) == 2 and tokens[0] == sub and str(int(tokens[-1])):
return True, int(tokens[-1])
except Exception as e:
print(e)
return False
manage_process = False
def manage():
global manage_process
while True:
manage_process = False
command = input('>>> ').strip()
if command == 'list':
try:
for i in range(1, len(connections) + 1):
print(f'{i}\t{connections[i][-1]}')
except KeyError:
pass
if len(connections) == 0:
print('[-] There are not any connections')
elif 'manage' in command:
index = is_manage_string('manage', command)
if index:
index = index[-1]
else:
print('[-] Invalid command\nUse manage "number_of_connection"\nEx: manage 1')
continue
if index >= 1 and index <= len(connections):
sock, addr = connections[index]
print(addr)
print(f'{addr} is used')
while True: ##Important here i launch loop which send data to socket
manage_process = True
command = input('>>> ').strip()
if command == 'messages':
try:
if messages[addr] == list():
print()
continue
except KeyError:
pass
try:
print('\n\n'.join(messages[addr]))
except KeyError:
print()
elif command == 'message':
try:
print(messages[addr][-1])
except:
print()
elif command == 'clear_messages':
try:
if messages[addr]:
messages[addr] = []
except KeyError:
print('[-] There are not any messages for cleaning up')
elif command == 'leave':
print(f'Leaving connection {addr}')
break
elif command: ##Important if command hasn't been detected as my special command(leave, messages), it will be executed like shell command
try:
sock.send(command.encode('utf-8'))
print(
'Your input has not been detected as special command and will execute like shell command or like client special command(ex: download; see client file)')
except ConnectionResetError:
print("Connection has been lost, therefore shell commands can't be used")
else:
continue
else:
print('[-] Invalid number of connection')
elif command:
print('[-] Invalid command\nType "help" to see avalible commands')
##Important
def event_loop():
while True:
data = selector.select()
for key, _ in data:
try:
key.data(key.fileobj)
except ConnectionResetError:
selector.unregister(key.fileobj)
##Important
threading.Thread(target=manage, daemon=True).start()
event_loop()
Client:
import socket
import subprocess
import shlex
import threading
import json
import base64
s = socket.socket()
s.connect(('localhost', 4444))
##Important
def read(file):
with open(file, 'rb') as f:
return base64.b64encode(f.read())
def runner(data):
sp_data = shlex.split(data)
try:
print(sp_data)
if len(sp_data) == 2 and sp_data[0] == 'download': ###Important here we create json object which will be send to socketserver
try:
content = read(sp_data[-1]).decode('utf-8')
print(content)
msg = json.dumps({'file': sp_data[-1], 'command': data, 'content': content,
'msg': f'[+] File {sp_data[-1]} has been successfully downloaded.'}).encode('utf-8')
except FileNotFoundError:
msg = json.dumps({'msg': f'[-] File {sp_data[-1]} not found', 'command': data}).encode('utf-8')
s.send(msg)
return ''
except Exception as e:
print(e)
command = subprocess.run(data, shell=True, encoding='cp866', text=True, capture_output=True)
command = command.stderr if command.stderr else command.stdout
command = json.dumps({'msg': command, 'command': data})
s.send(command.encode('utf-8'))
while True:##Important
data = s.recv(4096).decode('utf-8').strip()
threading.Thread(target=runner, args=(data,)).start()
import socket
import struct
class Socket(socket.socket):
def __init__(self):
self.sock = socket.socket()
super().__init__(socket.AF_INET, socket.SOCK_STREAM)
def send_msg(self, msg):
# Prefix each message with a 4-byte length (network byte order)
msg = struct.pack('>I', len(msg)) + msg
self.sock.sendall(msg)
def recv_msg(self):
# Read message length and unpack it into an integer
raw_msglen = self.recv_all(4)
if not raw_msglen:
return None
msglen = struct.unpack('>I', raw_msglen)[0]
# Read the message data
return self.recv_all(msglen)
def recv_all(self, n):
data = bytearray()
while len(data) < n:
packet = self.sock.recv(n - len(data))
if not packet:
return None
data.extend(packet)
return data
I reshaped ur code into Socket class.
Solution - usage these features:
def send_msg(sock, msg):
# Prefix each message with a 4-byte length (network byte order)
msg = struct.pack('>I', len(msg)) + msg
sock.sendall(msg)
def recv_msg(sock):
# Read message length and unpack it into an integer
raw_msglen = recvall(sock, 4)
if not raw_msglen:
return None
msglen = struct.unpack('>I', raw_msglen)[0]
# Read the message data
return recvall(sock, msglen)
def recvall(sock, n):
# Helper function to recv n bytes or return None if EOF is hit
data = bytearray()
while len(data) < n:
packet = sock.recv(n - len(data))
if not packet:
return None
data.extend(packet)
return data
I am using https://github.com/brendano/stanford_corenlp_pywrapper. It has a sock.py script for communcation which works for python2.
from __future__ import division
import subprocess, tempfile, time, os, logging, re, struct, socket, atexit, glob, itertools
from copy import copy,deepcopy
from pprint import pprint
try:
import ujson as json
except ImportError:
import json
# SUGGESTED: for constituent parsing models, specify shift-reduce parser in
# configdict with:
# 'parse.model': 'edu/stanford/nlp/models/srparser/englishSR.ser.gz'
MODES_items = [
('ssplit', {'annotators': "tokenize, ssplit",
'description': "tokenization and sentence splitting (included in all subsequent ones)", }),
('pos', {'annotators':"tokenize, ssplit, pos, lemma",
'description':"POS (and lemmas)",}),
('ner', {'annotators':"tokenize, ssplit, pos, lemma, ner, entitymentions",
'description':"POS and NER (and lemmas)",}),
('parse', {'annotators':"tokenize, ssplit, pos, lemma, parse",
'description':"fairly basic parsing with POS, lemmas, trees, dependencies",}),
('nerparse', {'annotators':"tokenize, ssplit, pos, lemma, ner, entitymentions, parse",
'description':"parsing with NER, POS, lemmas, depenencies."}),
('coref', {'annotators':"tokenize, ssplit, pos, lemma, ner, entitymentions, parse, dcoref",
'description':"Coreference, including constituent parsing."})
]
MODES = dict(MODES_items)
logging.basicConfig() # wtf, why we have to call this?
LOG = logging.getLogger("CoreNLP_PyWrapper")
LOG.setLevel("INFO")
# LOG.setLevel("DEBUG")
PARSEDOC_TIMEOUT_SEC = 60 * 5
STARTUP_BUSY_WAIT_INTERVAL_SEC = 1.0
def command(mode=None, configfile=None, configdict=None, comm_mode=None,
java_command="java",
java_options="-Xmx4g -XX:ParallelGCThreads=1",
**kwargs):
d = {}
d.update(**locals())
d.update(**kwargs)
more_config = ""
if mode is None and configfile is None and configdict is None:
assert False, "Need to set mode, or the annotators directly, for this wrapper to work."
if mode:
if configdict is not None:
assert 'annotators' not in configdict, "mode was given but annotators are set in the configdict. use only one please."
elif configdict is None:
configdict = {}
LOG.info("mode given as '%s' so setting annotators: %s" % (mode, MODES[mode]['annotators']))
configdict['annotators'] = MODES[mode]['annotators']
if configfile:
more_config += " --configfile {}".format(configfile)
if configdict:
j = json.dumps(configdict)
assert "'" not in j, "can't handle single quote in config values"
more_config += " --configdict '{}'".format(j)
d['more_config'] = more_config
if comm_mode=='SOCKET':
d['comm_info'] = "--server {server_port}".format(**d)
elif comm_mode=='PIPE':
d['comm_info'] = "--outpipe {outpipe}".format(**d)
else: assert False, "need comm_mode to be SOCKET or PIPE but got " + repr(comm_mode)
cmd = """exec {java_command} {java_options} -cp '{classpath}'
corenlp.SocketServer {comm_info} {more_config}"""
return cmd.format(**d).replace("\n", " ")
class SubprocessCrashed(Exception):
pass
class CoreNLP:
def __init__(self, mode=None,
configfile=None, configdict=None,
corenlp_jars=(
"/home/sw/corenlp/stanford-corenlp-full-2015-04-20/*",
"/home/sw/stanford-srparser-2014-10-23-models.jar",
),
comm_mode='PIPE', # SOCKET or PIPE
server_port=12340, outpipe_filename_prefix="/tmp/corenlp_pywrap_pipe",
**more_configdict_args
):
self.mode = mode
self.proc = None
self.server_port = server_port
self.configfile = configfile
self.comm_mode = comm_mode
self.outpipe = None
self.configdict = deepcopy(configdict)
if not self.configdict: self.configdict = {}
self.configdict.update(more_configdict_args)
if not self.configdict: self.configdict = None
if self.comm_mode=='PIPE':
tag = "pypid=%d_time=%s" % (os.getpid(), time.time())
self.outpipe = "%s_%s" % (outpipe_filename_prefix, tag)
assert not os.path.exists(self.outpipe)
assert isinstance(corenlp_jars, (list,tuple))
deglobbed = itertools.chain(*[glob.glob(f) for f in corenlp_jars])
assert any(os.path.exists(f) for f in deglobbed), "CoreNLP jar files don't seem to exist; are the paths correct? Searched files: %s" % repr(deglobbed)
local_libdir = os.path.join(os.path.abspath(os.path.dirname(__file__)),
'lib')
jars = [os.path.join(local_libdir, "*")]
jars += corenlp_jars
self.classpath = ':'.join(jars)
# self.classpath += ":../bin:bin" ## for eclipse java dev
# LOG.info("CLASSPATH: " + self.classpath)
self.start_server()
# This probably is only half-reliable, but worth a shot.
atexit.register(self.cleanup)
def cleanup(self):
self.kill_proc_if_running()
if self.outpipe and os.path.exists(self.outpipe):
os.unlink(self.outpipe)
def __del__(self):
# This is also an unreliable way to ensure the subproc is gone, but
# might as well try
self.cleanup()
def start_server(self):
self.kill_proc_if_running()
if self.comm_mode=='PIPE':
if not os.path.exists(self.outpipe):
os.mkfifo(self.outpipe)
cmd = command(**self.__dict__)
LOG.info("Starting java subprocess, and waiting for signal it's ready, with command: %s" % cmd)
self.proc = subprocess.Popen(cmd, shell=True, stdin=subprocess.PIPE)
time.sleep(STARTUP_BUSY_WAIT_INTERVAL_SEC)
if self.comm_mode=='SOCKET':
sock = self.get_socket(num_retries=100, retry_interval=STARTUP_BUSY_WAIT_INTERVAL_SEC)
sock.close()
elif self.comm_mode=='PIPE':
self.outpipe_fp = open(self.outpipe, 'r')
while True:
# This loop is for if you have timeouts for the socket connection
# The pipe system doesn't have timeouts, so this should run only
# once in that case.
try:
ret = self.send_command_and_parse_result('PING\t""', 2)
if ret is None:
continue
assert ret == "PONG", "Bad return data on startup ping: " + ret
LOG.info("Successful ping. The server has started.")
break
except socket.error, e:
LOG.info("Waiting for startup: ping got exception: %s %s" % (type(e), e))
LOG.info("pausing before retry")
time.sleep(STARTUP_BUSY_WAIT_INTERVAL_SEC)
LOG.info("Subprocess is ready.")
def ensure_proc_is_running(self):
if self.proc is None:
# Has never been started
self.start_server()
elif self.proc.poll() is not None:
# Restart
self.start_server()
def kill_proc_if_running(self):
if self.proc is None:
# it's never been started yet
return
retcode = self.proc.poll()
if retcode is not None:
LOG.info("Subprocess seems to be stopped, exit code %s" % retcode)
elif retcode is None:
LOG.warning("Killing subprocess %s" % self.proc.pid)
os.kill(self.proc.pid, 9)
def parse_doc(self, text, timeout=PARSEDOC_TIMEOUT_SEC, raw=False):
cmd = "PARSEDOC\t%s" % json.dumps(text)
return self.send_command_and_parse_result(cmd, timeout, raw=raw)
def get_socket(self, num_retries=1, retry_interval=1):
# could be smarter here about reusing the same socket?
for trial in range(num_retries):
try:
sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
# sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) # not sure if this is needed?
sock.connect(('localhost', self.server_port))
return sock
except (socket.error, socket.timeout) as e:
LOG.info("socket error when making connection (%s)" % e)
if trial < num_retries-1:
LOG.info("pausing before retry")
time.sleep(retry_interval)
assert False, "couldnt connect socket"
def send_command_and_parse_result(self, cmd, timeout, raw=False):
try:
self.ensure_proc_is_running()
data = self.send_command_and_get_string_result(cmd, timeout)
if data is None: return None
decoded = None
if raw:
return data
try:
decoded = json.loads(data)
except ValueError:
LOG.warning("Bad JSON returned from subprocess; returning null.")
LOG.warning("Bad JSON length %d, starts with: %s" % (len(data), repr(data[:1000])))
return None
return decoded
except socket.timeout, e:
LOG.info("Socket timeout happened, returning None: %s %s" % (type(e), e))
return None
# This is tricky. maybe the process is running smoothly but just
# taking longer than we like. if it's in thie state, and we try to
# send another command, what happens? Should we forcibly restart
# the process now just in case?
def send_command_and_get_string_result(self, cmd, timeout):
if self.comm_mode == 'SOCKET':
sock = self.get_socket(num_retries=100)
sock.settimeout(timeout)
sock.sendall(cmd + "\n")
size_info_str = sock.recv(8)
elif self.comm_mode == 'PIPE':
self.proc.stdin.write(cmd + "\n")
self.proc.stdin.flush()
size_info_str = self.outpipe_fp.read(8)
# java "long" is 8 bytes, which python struct calls "long long".
# java default byte ordering is big-endian.
size_info = struct.unpack('>Q', size_info_str)[0]
# print "size expected", size_info
chunks = []
curlen = lambda: sum(len(x) for x in chunks)
while True:
remaining_size = size_info - curlen()
if self.comm_mode == 'SOCKET':
data = sock.recv(remaining_size)
elif self.comm_mode == 'PIPE':
data = self.outpipe_fp.read(remaining_size)
chunks.append(data)
if curlen() >= size_info: break
if len(chunks) > 1000:
LOG.warning("Incomplete value from server")
return None
time.sleep(0.01)
return ''.join(chunks)
def test_modes():
import pytest
gosimple(comm_mode='SOCKET')
gosimple(comm_mode='PIPE')
with pytest.raises(AssertionError):
gosimple(comm_mode=None)
with pytest.raises(AssertionError):
gosimple(comm_mode='asdfasdf')
def test_coref():
assert_no_java("no java when starting")
p = CoreNLP("coref")
ret = p.parse_doc("I saw Fred. He saw me.")
pprint(ret)
assert 'entities' in ret
assert isinstance(ret['entities'], list)
def gosimple(**kwargs):
assert_no_java("no java when starting")
p = CoreNLP("ssplit", **kwargs)
ret = p.parse_doc("Hello world.")
# pprint(ret)
assert len(ret['sentences']) == 1
assert u' '.join(ret['sentences'][0]['tokens']) == u"Hello world ."
p.kill_proc_if_running()
assert_no_java()
def test_paths():
import pytest
with pytest.raises(AssertionError):
CoreNLP("ssplit", corenlp_jars=["/asdfadsf/asdfasdf"])
def assert_no_java(msg=""):
ps_output = os.popen("ps wux").readlines()
javalines = [x for x in ps_output if re.search(r'\bbin/java\b', x)]
print ''.join(javalines)
assert len(javalines) == 0, msg
# def test_doctimeout():
# assert_no_java("no java when starting")
#
# p = CoreNLP("pos")
# ret = p.parse_doc(open("allbrown.txt").read(), 0.5)
# assert ret is None
# p.kill_proc_if_running()
# assert_no_java()
if __name__=='__main__':
import sys
if sys.argv[1]=='modes':
for mode,d in MODES_items:
print " * `%s`: %s" % (mode, d['description'])
if sys.argv[1]=='modes_json':
# import json as stdjson
# print stdjson.dumps(MODES, indent=4)
print '"%s"' % json.dumps(MODES).replace('"', r'\"')
I changed it to python3 using 2to3 script which addded brackets around print and changed exception handling. Updated code
"""
Client and process monitor for the java socket server.
"""
import subprocess, tempfile, time, os, logging, re, struct, socket, atexit, glob, itertools
from copy import copy,deepcopy
from pprint import pprint
try:
import ujson as json
except ImportError:
import json
# SUGGESTED: for constituent parsing models, specify shift-reduce parser in
# configdict with:
# 'parse.model': 'edu/stanford/nlp/models/srparser/englishSR.ser.gz'
MODES_items = [
('ssplit', {'annotators': "tokenize, ssplit",
'description': "tokenization and sentence splitting (included in all subsequent ones)", }),
('pos', {'annotators':"tokenize, ssplit, pos, lemma",
'description':"POS (and lemmas)",}),
('ner', {'annotators':"tokenize, ssplit, pos, lemma, ner, entitymentions",
'description':"POS and NER (and lemmas)",}),
('parse', {'annotators':"tokenize, ssplit, pos, lemma, parse",
'description':"fairly basic parsing with POS, lemmas, trees, dependencies",}),
('nerparse', {'annotators':"tokenize, ssplit, pos, lemma, ner, entitymentions, parse",
'description':"parsing with NER, POS, lemmas, depenencies."}),
('coref', {'annotators':"tokenize, ssplit, pos, lemma, ner, entitymentions, parse, dcoref",
'description':"Coreference, including constituent parsing."})
]
MODES = dict(MODES_items)
logging.basicConfig() # wtf, why we have to call this?
LOG = logging.getLogger("CoreNLP_PyWrapper")
LOG.setLevel("INFO")
# LOG.setLevel("DEBUG")
PARSEDOC_TIMEOUT_SEC = 60 * 5
STARTUP_BUSY_WAIT_INTERVAL_SEC = 1.0
def command(mode=None, configfile=None, configdict=None, comm_mode=None,
java_command="java",
java_options="-Xmx4g -XX:ParallelGCThreads=1",
**kwargs):
d = {}
d.update(**locals())
d.update(**kwargs)
more_config = ""
if mode is None and configfile is None and configdict is None:
assert False, "Need to set mode, or the annotators directly, for this wrapper to work."
if mode:
if configdict is not None:
assert 'annotators' not in configdict, "mode was given but annotators are set in the configdict. use only one please."
elif configdict is None:
configdict = {}
LOG.info("mode given as '%s' so setting annotators: %s" % (mode, MODES[mode]['annotators']))
configdict['annotators'] = MODES[mode]['annotators']
if configfile:
more_config += " --configfile {}".format(configfile)
if configdict:
j = json.dumps(configdict)
assert "'" not in j, "can't handle single quote in config values"
more_config += " --configdict '{}'".format(j)
d['more_config'] = more_config
if comm_mode=='SOCKET':
d['comm_info'] = "--server {server_port}".format(**d)
elif comm_mode=='PIPE':
d['comm_info'] = "--outpipe {outpipe}".format(**d)
else: assert False, "need comm_mode to be SOCKET or PIPE but got " + repr(comm_mode)
cmd = """exec {java_command} {java_options} -cp '{classpath}'
corenlp.SocketServer {comm_info} {more_config}"""
return cmd.format(**d).replace("\n", " ")
class SubprocessCrashed(Exception):
pass
class CoreNLP:
def __init__(self, mode=None,
configfile=None, configdict=None,
corenlp_jars=(
"/home/sw/corenlp/stanford-corenlp-full-2015-04-20/*",
"/home/sw/stanford-srparser-2014-10-23-models.jar",
),
comm_mode='PIPE', # SOCKET or PIPE
server_port=12340, outpipe_filename_prefix="/tmp/corenlp_pywrap_pipe",
**more_configdict_args
):
self.mode = mode
self.proc = None
self.server_port = server_port
self.configfile = configfile
self.comm_mode = comm_mode
self.outpipe = None
self.configdict = deepcopy(configdict)
if not self.configdict: self.configdict = {}
self.configdict.update(more_configdict_args)
if not self.configdict: self.configdict = None
if self.comm_mode=='PIPE':
tag = "pypid=%d_time=%s" % (os.getpid(), time.time())
self.outpipe = "%s_%s" % (outpipe_filename_prefix, tag)
assert not os.path.exists(self.outpipe)
assert isinstance(corenlp_jars, (list,tuple))
deglobbed = itertools.chain(*[glob.glob(f) for f in corenlp_jars])
assert any(os.path.exists(f) for f in deglobbed), "CoreNLP jar files don't seem to exist; are the paths correct? Searched files: %s" % repr(deglobbed)
local_libdir = os.path.join(os.path.abspath(os.path.dirname(__file__)),
'lib')
jars = [os.path.join(local_libdir, "*")]
jars += corenlp_jars
self.classpath = ':'.join(jars)
# self.classpath += ":../bin:bin" ## for eclipse java dev
# LOG.info("CLASSPATH: " + self.classpath)
self.start_server()
# This probably is only half-reliable, but worth a shot.
atexit.register(self.cleanup)
def cleanup(self):
self.kill_proc_if_running()
if self.outpipe and os.path.exists(self.outpipe):
os.unlink(self.outpipe)
def __del__(self):
# This is also an unreliable way to ensure the subproc is gone, but
# might as well try
self.cleanup()
def start_server(self):
self.kill_proc_if_running()
if self.comm_mode=='PIPE':
if not os.path.exists(self.outpipe):
os.mkfifo(self.outpipe)
cmd = command(**self.__dict__)
LOG.info("Starting java subprocess, and waiting for signal it's ready, with command: %s" % cmd)
self.proc = subprocess.Popen(cmd, shell=True, stdin=subprocess.PIPE)
time.sleep(STARTUP_BUSY_WAIT_INTERVAL_SEC)
if self.comm_mode=='SOCKET':
sock = self.get_socket(num_retries=100, retry_interval=STARTUP_BUSY_WAIT_INTERVAL_SEC)
sock.close()
elif self.comm_mode=='PIPE':
self.outpipe_fp = open(self.outpipe, 'r')
while True:
# This loop is for if you have timeouts for the socket connection
# The pipe system doesn't have timeouts, so this should run only
# once in that case.
try:
ret = self.send_command_and_parse_result('PING\t""', 2)
if ret is None:
continue
assert ret == "PONG", "Bad return data on startup ping: " + ret
LOG.info("Successful ping. The server has started.")
break
except socket.error as e:
LOG.info("Waiting for startup: ping got exception: %s %s" % (type(e), e))
LOG.info("pausing before retry")
time.sleep(STARTUP_BUSY_WAIT_INTERVAL_SEC)
LOG.info("Subprocess is ready.")
def ensure_proc_is_running(self):
if self.proc is None:
# Has never been started
self.start_server()
elif self.proc.poll() is not None:
# Restart
self.start_server()
def kill_proc_if_running(self):
if self.proc is None:
# it's never been started yet
return
retcode = self.proc.poll()
if retcode is not None:
LOG.info("Subprocess seems to be stopped, exit code %s" % retcode)
elif retcode is None:
LOG.warning("Killing subprocess %s" % self.proc.pid)
os.kill(self.proc.pid, 9)
def parse_doc(self, text, timeout=PARSEDOC_TIMEOUT_SEC, raw=False):
cmd = "PARSEDOC\t%s" % json.dumps(text)
return self.send_command_and_parse_result(cmd, timeout, raw=raw)
def get_socket(self, num_retries=1, retry_interval=1):
# could be smarter here about reusing the same socket?
for trial in range(num_retries):
try:
sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
# sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) # not sure if this is needed?
sock.connect(('localhost', self.server_port))
return sock
except (socket.error, socket.timeout) as e:
LOG.info("socket error when making connection (%s)" % e)
if trial < num_retries-1:
LOG.info("pausing before retry")
time.sleep(retry_interval)
assert False, "couldnt connect socket"
def send_command_and_parse_result(self, cmd, timeout, raw=False):
try:
self.ensure_proc_is_running()
data = self.send_command_and_get_string_result(cmd, timeout)
if data is None: return None
decoded = None
if raw:
return data
try:
decoded = json.loads(data)
except ValueError:
LOG.warning("Bad JSON returned from subprocess; returning null.")
LOG.warning("Bad JSON length %d, starts with: %s" % (len(data), repr(data[:1000])))
return None
return decoded
except socket.timeout as e:
LOG.info("Socket timeout happened, returning None: %s %s" % (type(e), e))
return None
# This is tricky. maybe the process is running smoothly but just
# taking longer than we like. if it's in thie state, and we try to
# send another command, what happens? Should we forcibly restart
# the process now just in case?
def send_command_and_get_string_result(self, cmd, timeout):
if self.comm_mode == 'SOCKET':
sock = self.get_socket(num_retries=100)
sock.settimeout(timeout)
sock.sendall(cmd + "\n")
size_info_str = sock.recv(8)
elif self.comm_mode == 'PIPE':
self.proc.stdin.write(cmd + "\n")
self.proc.stdin.flush()
size_info_str = self.outpipe_fp.read(8)
# java "long" is 8 bytes, which python struct calls "long long".
# java default byte ordering is big-endian.
size_info = struct.unpack('>Q', size_info_str)[0]
# print "size expected", size_info
chunks = []
curlen = lambda: sum(len(x) for x in chunks)
while True:
remaining_size = size_info - curlen()
if self.comm_mode == 'SOCKET':
data = sock.recv(remaining_size)
elif self.comm_mode == 'PIPE':
data = self.outpipe_fp.read(remaining_size)
chunks.append(data)
if curlen() >= size_info: break
if len(chunks) > 1000:
LOG.warning("Incomplete value from server")
return None
time.sleep(0.01)
return ''.join(chunks)
def test_modes():
import pytest
gosimple(comm_mode='SOCKET')
gosimple(comm_mode='PIPE')
with pytest.raises(AssertionError):
gosimple(comm_mode=None)
with pytest.raises(AssertionError):
gosimple(comm_mode='asdfasdf')
def test_coref():
assert_no_java("no java when starting")
p = CoreNLP("coref")
ret = p.parse_doc("I saw Fred. He saw me.")
pprint(ret)
assert 'entities' in ret
assert isinstance(ret['entities'], list)
def gosimple(**kwargs):
assert_no_java("no java when starting")
p = CoreNLP("ssplit", **kwargs)
ret = p.parse_doc("Hello world.")
# pprint(ret)
assert len(ret['sentences']) == 1
assert ' '.join(ret['sentences'][0]['tokens']) == "Hello world ."
p.kill_proc_if_running()
assert_no_java()
def test_paths():
import pytest
with pytest.raises(AssertionError):
CoreNLP("ssplit", corenlp_jars=["/asdfadsf/asdfasdf"])
def assert_no_java(msg=""):
ps_output = os.popen("ps wux").readlines()
javalines = [x for x in ps_output if re.search(r'\bbin/java\b', x)]
print(''.join(javalines))
assert len(javalines) == 0, msg
# def test_doctimeout():
# assert_no_java("no java when starting")
#
# p = CoreNLP("pos")
# ret = p.parse_doc(open("allbrown.txt").read(), 0.5)
# assert ret is None
# p.kill_proc_if_running()
# assert_no_java()
if __name__=='__main__':
import sys
if sys.argv[1]=='modes':
for mode,d in MODES_items:
print(" * `%s`: %s" % (mode, d['description']))
if sys.argv[1]=='modes_json':
# import json as stdjson
# print stdjson.dumps(MODES, indent=4)
print('"%s"' % json.dumps(MODES).replace('"', r'\"'))
However, I still get an error at
byteself.proc.stdin.write(cmd + "\n")
TypeError: a bytes-like object is required, not 'str'
I know it has to do with encoding and I may have to use encode/decode at a few places where it reads/writes but since I have not worked with sockets a lot, I am not sure where to change.