Python fast static file serving

What's the fastest way to serve static files in Python? I'm looking for something that matches, or comes close to, Nginx's static file serving.
I know of SimpleHTTPServer, but I'm not sure it can serve multiple files efficiently and reliably.
Also, I don't mind it being part of a lib/framework of some sort, as long as that lib/framework is lightweight.

EDIT: This project appears to be dead.
What about FAPWS3? One of the selling points:
Static file server
FAPWS can be used to serve a huge amount of static file requests. With the help of an async database in the backend, you can use FAPWS as your own Amazon S3.

If you're looking for a one-liner, you can do the following:
$> python -m SimpleHTTPServer
This will not fulfill all the requirements, but it's worth mentioning as the simplest way :-)
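On Python 3 the module is http.server, so the equivalent one-liner is python3 -m http.server; since 3.7 that command even serves each request in its own thread. The same thing done programmatically, as a minimal sketch:
# Python 3.7+ sketch of the same one-liner, threaded
from http.server import ThreadingHTTPServer, SimpleHTTPRequestHandler

ThreadingHTTPServer(("", 8000), SimpleHTTPRequestHandler).serve_forever()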

I would highly recommend using a third-party HTTP server to serve static files.
Servers like nginx are heavily optimized for the task at hand, parallelized, and written in fast languages.
Python is interpreted and, with CPython's GIL, effectively tied to one processor core.

The original SimpleHTTPServer from the Python standard library does NOT "handle serving multiple files efficiently and reliably". For instance, while you are downloading one file from it, any other HTTP request has to wait, because SimpleHTTPServer.py is a single-threaded HTTP server that supports only one connection at a time.
Fortunately, SimpleHTTPServer.py uses BaseHTTPServer.HTTPServer as its server class, which can be wrapped with SocketServer.ForkingMixIn or SocketServer.ThreadingMixIn (also from the standard library) to get multi-process or multi-thread support, greatly improving the simple HTTP server's efficiency and reliability, as the sketch below shows.
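The core of the idea fits in a few lines; a minimal sketch (Python 2 module names, matching the full version below):
# Minimal sketch: a threaded SimpleHTTPServer via ThreadingMixIn
import BaseHTTPServer, SimpleHTTPServer, SocketServer

class ThreadingSimpleServer(SocketServer.ThreadingMixIn, BaseHTTPServer.HTTPServer):
    pass  # each request is now handled in its own thread

ThreadingSimpleServer(("", 8000), SimpleHTTPServer.SimpleHTTPRequestHandler).serve_forever()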
Based on this idea, a full SimpleHTTPServer with multi-thread/multi-process support and a command-line interface, modified from the original, is given as follows:
$ python2.7 ModifiedSimpleHTTPServer.py
usage: ModifiedSimpleHTTPServer.py [-h] [--pydoc] [--port PORT]
[--type {process,thread}] [--root ROOT]
[--run]
Modified SimpleHTTPServer with MultiThread/MultiProcess and IP bind support.
Original: https://docs.python.org/2.7/library/simplehttpserver.html
Modified by: vbem#163.com
optional arguments:
-h, --help show this help message and exit
--pydoc show this module's pydoc
run arguments:
--port PORT specify server port (default: 8000)
--type {process,thread}
specify server type (default: 'thread')
--root ROOT specify root directory (default: cwd '/home/vbem')
--run run http server foreground
NOTE: stdin for input, stdout for result, stderr for logging
For example, ModifiedSimpleHTTPServer.py --run --root /var/log --type process will run a multi-process HTTP static file server with '/var/log' as its root directory.
Modified codes are:
#! /usr/bin/env python2.7
# -*- coding: utf-8 -*-
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
r"""Modified SimpleHTTPServer with MultiThread/MultiProcess and IP bind support.
Original: https://docs.python.org/2.7/library/simplehttpserver.html
Modified by: vbem#163.com
"""
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
import os, sys, pwd, posixpath, BaseHTTPServer, urllib, cgi, shutil, mimetypes, socket, SocketServer
from cStringIO import StringIO
USERNAME = pwd.getpwuid(os.getuid()).pw_name
HOSTNAME = socket.gethostname()
PORT_DFT = 8000
class SimpleHTTPRequestHandler(BaseHTTPServer.BaseHTTPRequestHandler):
server_version = "SimpleHTTP/0.6"
def do_GET(self):
f = self.send_head()
if f:
self.copyfile(f, self.wfile)
f.close()
def do_HEAD(self):
f = self.send_head()
if f:
f.close()
def send_head(self):
path = self.translate_path(self.path)
f = None
if os.path.isdir(path):
if not self.path.endswith('/'):
self.send_response(301)
self.send_header("Location", self.path + "/")
self.end_headers()
return None
for index in "index.html", "index.htm":
index = os.path.join(path, index)
if os.path.exists(index):
path = index
break
else:
return self.list_directory(path)
ctype = self.guess_type(path)
try:
f = open(path, 'rb')
except IOError:
self.send_error(404, "File not found")
return None
self.send_response(200)
self.send_header("Content-type", ctype)
fs = os.fstat(f.fileno())
self.send_header("Content-Length", str(fs[6]))
self.send_header("Last-Modified", self.date_time_string(fs.st_mtime))
self.end_headers()
return f
def list_directory(self, path):
try:
list = ['..'] + os.listdir(path)  # include '..' so the parent directory is linked
except os.error:
self.send_error(404, "No permission to list directory")
return None
list.sort(key=lambda a: a.lower())
f = StringIO()
displaypath = cgi.escape(urllib.unquote(self.path))
f.write('<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 3.2 Final//EN">')
f.write("<html>\n<title>%s %s</title>\n<body>" % (HOSTNAME, displaypath))
f.write("%s#%s:<strong>%s</strong>\n" % (USERNAME, HOSTNAME, path.rstrip('/')+'/'))
f.write("<hr>\n<ul>\n")
for name in list:
fullname = os.path.join(path, name)
displayname = linkname = name
if os.path.isdir(fullname):
displayname = name + "/"
linkname = name + "/"
if os.path.islink(fullname):
displayname = name + "#"
f.write('<li><a href="%s">%s</a>\n'
% (urllib.quote(linkname), cgi.escape(displayname)))
f.write("</ul>\n<hr>\n<pre>%s</pre>\n</body>\n</html>\n" % __doc__)
length = f.tell()
f.seek(0)
self.send_response(200)
encoding = sys.getfilesystemencoding()
self.send_header("Content-type", "text/html; charset=%s" % encoding)
self.send_header("Content-Length", str(length))
self.end_headers()
return f
def translate_path(self, path):
path = path.split('?',1)[0]
path = path.split('#',1)[0]
path = posixpath.normpath(urllib.unquote(path))
words = path.split('/')
words = filter(None, words)
path = os.getcwd()
for word in words:
drive, word = os.path.splitdrive(word)
head, word = os.path.split(word)
if word in (os.curdir, os.pardir): continue
path = os.path.join(path, word)
return path
def copyfile(self, source, outputfile):
shutil.copyfileobj(source, outputfile)
def guess_type(self, path):
base, ext = posixpath.splitext(path)
if ext in self.extensions_map:
return self.extensions_map[ext]
ext = ext.lower()
if ext in self.extensions_map:
return self.extensions_map[ext]
else:
return self.extensions_map['']
if not mimetypes.inited:
mimetypes.init()
extensions_map = mimetypes.types_map.copy()
extensions_map.update({'': 'text/plain'})
class ProcessedHTTPServer(SocketServer.ForkingMixIn, BaseHTTPServer.HTTPServer):
r"""Handle requests in multi process."""
class ThreadedHTTPServer(SocketServer.ThreadingMixIn, BaseHTTPServer.HTTPServer):
r"""Handle requests in a separate thread."""
SERVER_DICT = {
'thread' : ThreadedHTTPServer,
'process' : ProcessedHTTPServer,
}
SERVER_DFT = 'thread'
def run(sCwd=None, sServer=SERVER_DFT, nPort=PORT_DFT, *lArgs, **dArgs):
r"""
"""
sys.stderr.write('start with %r\n' % sys._getframe().f_locals)
if sCwd is not None:
os.chdir(sCwd)
cServer = SERVER_DICT[sServer]
oHttpd = cServer(("", nPort), SimpleHTTPRequestHandler)
sys.stderr.write('http://%s:%s/\n' % (HOSTNAME, nPort))
oHttpd.serve_forever()
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
# main
def _main():
r"""Main.
"""
import argparse
oParser = argparse.ArgumentParser(
description = __doc__,
formatter_class = argparse.RawTextHelpFormatter,
epilog = 'NOTE: stdin for input, stdout for result, stderr for logging',
)
oParser.add_argument('--pydoc', action='store_true',
help = "show this module's pydoc",
)
oGroupR = oParser.add_argument_group(title='run arguments', description='')
oGroupR.add_argument('--port', action='store', type=int, default=PORT_DFT,
help = 'specify server port (default: %(default)r)',
)
oGroupR.add_argument('--type', action='store', default=SERVER_DFT, choices=SERVER_DICT.keys(),
help = 'specify server type (default: %(default)r)',
)
oGroupR.add_argument('--root', action='store', default=os.getcwd(),
help = 'specify root directory (default: cwd %(default)r)',
)
oGroupR.add_argument('--run', action='store_true',
help = '\n'.join((
'run http server foreground',
)))
oArgs = oParser.parse_args()
if oArgs.pydoc:
help(os.path.splitext(os.path.basename(__file__))[0])
elif oArgs.run:
return run(sCwd=oArgs.root, sServer=oArgs.type, nPort=oArgs.port)
else:
oParser.print_help()
return 1
return 0
if __name__ == "__main__":
exit(_main())
Meanwhile, this single Python file of only about 200 lines may satisfy your "in Python" and "lightweight" requirements.
Last but not least, ModifiedSimpleHTTPServer.py can be handy for temporary use; however, Nginx is advised for long-term use.

Related

AIRFLOW : Customise SFTPOperator to download multiple files

I'm trying to customise the SFTPOperator to download multiple files from a server. I know that the original SFTPOperator only allows one file at a time.
I copied the code from the source and tweaked it by adding a new function called get_xml_from_source(). Please refer to the code below:
def get_xml_from_source(sftp_client, remote_filepath, local_filepath, prev_execution_date, execution_date):
"""
Copy from Source to local path
"""
files_attr = sftp_client.listdir_attr(remote_filepath) # eg: /source/ HITTING ERROR HERE
files_name = sftp_client.listdir(remote_filepath) # eg: /source/
today_midnight = datetime.combine(datetime.today(), time.min)
yesterday_midnight = today_midnight - timedelta(days=1)
for file_attr, file_name in zip(files_attr, files_name):
modified_time = datetime.fromtimestamp(file_attr.st_mtime)
if yesterday_midnight <= modified_time < today_midnight:
# if prev_execution_date <= modified_time < execution_date:
try:
# Download to local path
sftp_client.get(remote_filepath, local_filepath)
print(file_name)
except: # pylint: disable=bare-except
print("File not found")
else:
print("Not the file!")
Where this function will only download files from yesterday up to today.
I added the function at this line:
with self.ssh_hook.get_conn() as ssh_client:
sftp_client = ssh_client.open_sftp()
if self.operation.lower() == SFTPOperation.GET:
local_folder = os.path.dirname(self.local_filepath)
if self.create_intermediate_dirs:
# Create Intermediate Directories if it doesn't exist
try:
os.makedirs(local_folder)
except OSError:
if not os.path.isdir(local_folder):
raise
file_msg = "from {0} to {1}".format(self.remote_filepath,
self.local_filepath)
self.log.info("Starting to transfer %s", file_msg)
# This is where it starts to copy, customization begins here
# sftp_client.get(self.remote_filepath, self.local_filepath) <--- Original code that I commented out and replace with mine below
get_xml_from_source(sftp_client, self.remote_filepath,
self.local_filepath, self.prev_execution_date, self.execution_date)
Note that the rest of the code did not change; it is exactly as it appears in the source.
I keep hitting an error at files_attr = sftp_client.listdir_attr(remote_filepath) with this error:
Error while transferring from /source/ to
/path/to/destination, error: [Errno 2] No such file.
which obviously means it can't find the SFTP directory. I tried running the whole function locally, and it works fine.
Is there any part of the code that ties the paramiko connection to getting only one file? I checked the paramiko connection for SFTPOperator, and it should be fine. In this case, how should I fix it?
This is how I established my connection when running locally :
def connect_to_source():
"""
Get source credentials
:param: None
:return: username & password
"""
logging.basicConfig(stream=sys.stderr, level=logging.DEBUG)
ssh = paramiko.SSHClient()
ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
username, password = get_eet_credentials()
# key = paramiko.RSAKey.from_private_key_file(openssh_key, password=password)
ssh.connect(hostname=SFTP_SERVER, port=SFTP_PORT_NUMBER,
username=username, password=password)
client = ssh.open_sftp()
print("Connection to source success!")
return client
Lastly, below is my airflow task:
def copy_from_source():
"""
Copy XML file from source to local path
"""
return SFTPOperator(
task_id="copy_from_source",
ssh_conn_id="source_conn",
local_filepath=f"{current_dir}/destination",
remote_filepath= "/source/",
prev_execution_date='{{ prev_execution_date }}',
execution_date='{{ execution_date }}', # strftime("%Y-%m-%d %H:%M:%S")
create_intermediate_dirs=True,
operation="get",
dag=dag
)
I'm trying to do something similar to you. I'm not sure what is causing the issues you are facing, but here is the updated SFTP operator I have written that gets multiple files from a server:
sftp_get_multiple_files_operator.py
import os
from pathlib import Path
from typing import Any
from airflow.exceptions import AirflowException
from airflow.models import BaseOperator
from airflow.contrib.hooks.ssh_hook import SSHHook
class SFTPGetMultipleFilesOperator(BaseOperator):
template_fields = ('local_directory', 'remote_filename_pattern', 'remote_host')
def __init__(
self,
*,
ssh_hook=None,
ssh_conn_id=None,
remote_host=None,
local_directory=None,
remote_filename_pattern=None,
filetype=None,
confirm=True,
create_intermediate_dirs=False,
**kwargs,
) -> None:
super().__init__(**kwargs)
self.ssh_hook = ssh_hook
self.ssh_conn_id = ssh_conn_id
self.remote_host = remote_host
self.local_directory = local_directory
self.filetype = filetype
self.remote_filename_pattern = remote_filename_pattern
self.confirm = confirm
self.create_intermediate_dirs = create_intermediate_dirs
def execute(self, context: Any) -> str:
file_msg = None
try:
if self.ssh_conn_id:
if self.ssh_hook and isinstance(self.ssh_hook, SSHHook):
self.log.info("ssh_conn_id is ignored when ssh_hook is provided.")
else:
self.log.info(
"ssh_hook is not provided or invalid. Trying ssh_conn_id to create SSHHook."
)
self.ssh_hook = SSHHook(ssh_conn_id=self.ssh_conn_id)
if not self.ssh_hook:
raise AirflowException("Cannot operate without ssh_hook or ssh_conn_id.")
if self.remote_host is not None:
self.log.info(
"remote_host is provided explicitly. "
"It will replace the remote_host which was defined "
"in ssh_hook or predefined in connection of ssh_conn_id."
)
self.ssh_hook.remote_host = self.remote_host
with self.ssh_hook.get_conn() as ssh_client:
sftp_client = ssh_client.open_sftp()
all_files = sftp_client.listdir()
self.log.info(f'Found {len(all_files)} files on server')
timestamp = context['ds_nodash']
filename_pattern = self.remote_filename_pattern + timestamp
# fetch all CSV files for the run date that match the filename pattern
matching_files = [f for f in all_files
if f.find(filename_pattern) != -1]
# if file type is specified filter matching files for the file type
if self.filetype is not None:
matching_files = [filename for filename in matching_files
if filename[-len(self.filetype):] == self.filetype]
self.log.info(f'Found {len(matching_files)} files with name including {filename_pattern}')
if self.create_intermediate_dirs:
# create the download directory itself; os.path.dirname(self.local_directory)
# would only create its parent, and the get() calls below need it to exist
Path(self.local_directory).mkdir(parents=True, exist_ok=True)
for f in matching_files:
self.log.info(f"Starting to transfer from /{f} to {self.local_directory}/{f}")
sftp_client.get(f'/{f}', f'{self.local_directory}/{f}')
except Exception as e:
raise AirflowException(f"Error while transferring {file_msg}, error: {str(e)}")
return self.local_directory
def _make_intermediate_dirs(sftp_client, remote_directory) -> None:
"""
Create all the intermediate directories in a remote host
:param sftp_client: A Paramiko SFTP client.
:param remote_directory: Absolute Path of the directory containing the file
:return:
"""
if remote_directory == '/':
sftp_client.chdir('/')
return
if remote_directory == '':
return
try:
sftp_client.chdir(remote_directory)
except OSError:
dirname, basename = os.path.split(remote_directory.rstrip('/'))
_make_intermediate_dirs(sftp_client, dirname)
sftp_client.mkdir(basename)
sftp_client.chdir(basename)
return
dag.py
sftp_report = SFTPGetMultipleFilesOperator(
task_id=f"sftp_reports_to_gcs",
ssh_conn_id="sftp_connection",
local_directory=f'/opt/airflow/dags/reports',
remote_filename_pattern=f'reportname_', # ds_nodash is added in the operator by accessing Airflow context
create_intermediate_dirs=True,
filetype='.csv'
)

python http server threading via cli

New in version 3.7: ThreadingHTTPServer is supported, as mentioned in the docs.
To run a server from the command line we use
python -m http.server
but does it still run the plain HTTPServer? Is there any way to enable the threading version via the command line?
EDIT:
Python 3.7 runs ThreadingHTTPServer by default; no argument is necessary.
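For Python 3.4–3.6, where no such default (or command-line switch) exists, the same behaviour takes only a few lines; a minimal sketch of what 3.7 now ships built in:
# Sketch for Python < 3.7: compose the threading server yourself
from http.server import HTTPServer, SimpleHTTPRequestHandler
from socketserver import ThreadingMixIn

class ThreadingHTTPServer(ThreadingMixIn, HTTPServer):
    daemon_threads = True  # don't let in-flight requests block interpreter exit

ThreadingHTTPServer(("", 8000), SimpleHTTPRequestHandler).serve_forever()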
Simple Python 2 HTTP Server with multi-threading and partial-content support
#!/usr/bin/env python2
# Standard library imports.
from SocketServer import ThreadingMixIn
import BaseHTTPServer
import SimpleHTTPServer
import sys
import json
import os
from os.path import (join, exists, dirname, abspath, isabs, sep, walk, splitext,
isdir, basename, expanduser, split, splitdrive)
from os import makedirs, unlink, getcwd, chdir, curdir, pardir, rename, fstat
from shutil import copyfileobj, copytree
import glob
from zipfile import ZipFile
from urlparse import urlparse, parse_qs
from urllib import urlopen, quote, unquote
from posixpath import normpath
from cStringIO import StringIO
import re
import ConfigParser
import cgi
import threading
import socket
import errno
DATA_DIR = getcwd() # join(expanduser('~'), APP_NAME)
class ThreadingHTTPServer(ThreadingMixIn, BaseHTTPServer.HTTPServer):
pass
class RequestHandler(SimpleHTTPServer.SimpleHTTPRequestHandler):
""" Handler to handle POST requests for actions.
"""
serve_path = DATA_DIR
def do_GET(self):
""" Overridden to handle HTTP Range requests. """
self.range_from, self.range_to = self._get_range_header()
if self.range_from is None:
# no Range header; fall through to the stock do_GET
return SimpleHTTPServer.SimpleHTTPRequestHandler.do_GET(self)
print 'range request', self.range_from, self.range_to
f = self.send_range_head()
if f:
self.copy_file_range(f, self.wfile)
f.close()
def copy_file_range(self, in_file, out_file):
""" Copy only the range in self.range_from/to. """
in_file.seek(self.range_from)
# Add 1 because the range is inclusive
bytes_to_copy = 1 + self.range_to - self.range_from
buf_length = 64*1024
bytes_copied = 0
while bytes_copied < bytes_to_copy:
read_buf = in_file.read(min(buf_length, bytes_to_copy-bytes_copied))
if len(read_buf) == 0:
break
out_file.write(read_buf)
bytes_copied += len(read_buf)
return bytes_copied
def send_range_head(self):
"""Common code for GET and HEAD commands.
This sends the response code and MIME headers.
Return value is either a file object (which has to be copied
to the outputfile by the caller unless the command was HEAD,
and must be closed by the caller under all circumstances), or
None, in which case the caller has nothing further to do.
"""
path = self.translate_path(self.path)
f = None
if isdir(path):
if not self.path.endswith('/'):
# redirect browser - doing basically what apache does
self.send_response(301)
self.send_header("Location", self.path + "/")
self.end_headers()
return None
for index in "index.html", "index.htm":
index = join(path, index)
if exists(index):
path = index
break
else:
return self.list_directory(path)
if not exists(path) and path.endswith('/data'):
# FIXME: Handle grits-like query with /data appended to path
# stupid grits
if exists(path[:-5]):
path = path[:-5]
ctype = self.guess_type(path)
try:
# Always read in binary mode. Opening files in text mode may cause
# newline translations, making the actual size of the content
# transmitted *less* than the content-length!
f = open(path, 'rb')
except IOError:
self.send_error(404, "File not found")
return None
if self.range_from is None:
self.send_response(200)
else:
self.send_response(206)
self.send_header("Content-type", ctype)
fs = fstat(f.fileno())
file_size = fs.st_size
if self.range_from is not None:
if self.range_to is None or self.range_to >= file_size:
self.range_to = file_size-1
self.send_header("Content-Range",
"bytes %d-%d/%d" % (self.range_from,
self.range_to,
file_size))
# Add 1 because ranges are inclusive
self.send_header("Content-Length",
(1 + self.range_to - self.range_from))
else:
self.send_header("Content-Length", str(file_size))
self.send_header("Last-Modified", self.date_time_string(fs.st_mtime))
self.end_headers()
return f
def list_directory(self, path):
"""Helper to produce a directory listing (absent index.html).
Return value is either a file object, or None (indicating an
error). In either case, the headers are sent, making the
interface the same as for send_head().
"""
try:
list = os.listdir(path)
except os.error:
self.send_error(404, "No permission to list directory")
return None
list.sort(key=lambda a: a.lower())
f = StringIO()
displaypath = cgi.escape(unquote(self.path))
f.write('<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 3.2 Final//EN">')
f.write("<html>\n<title>Directory listing for %s</title>\n" % displaypath)
f.write("<body>\n<h2>Directory listing for %s</h2>\n" % displaypath)
f.write("<hr>\n<ul>\n")
for name in list:
fullname = os.path.join(path, name)
displayname = linkname = name
# Append / for directories or # for symbolic links
if os.path.isdir(fullname):
displayname = name + "/"
linkname = name + "/"
if os.path.islink(fullname):
displayname = name + "#"
# Note: a link to a directory displays with # and links with /
f.write('<li><a href="%s">%s</a>\n'
% (quote(linkname), cgi.escape(displayname)))
f.write("</ul>\n<hr>\n</body>\n</html>\n")
length = f.tell()
f.seek(0)
self.send_response(200)
encoding = sys.getfilesystemencoding()
self.send_header("Content-type", "text/html; charset=%s" % encoding)
self.send_header("Content-Length", str(length))
self.end_headers()
return f
def translate_path(self, path):
""" Override to handle redirects.
"""
path = path.split('?',1)[0]
path = path.split('#',1)[0]
path = normpath(unquote(path))
words = path.split('/')
words = filter(None, words)
path = self.serve_path
for word in words:
drive, word = splitdrive(word)
head, word = split(word)
if word in (curdir, pardir): continue
path = join(path, word)
return path
# Private interface ######################################################
def _get_range_header(self):
""" Returns request Range start and end if specified.
If Range header is not specified returns (None, None)
"""
range_header = self.headers.getheader("Range")
if range_header is None:
return (None, None)
if not range_header.startswith("bytes="):
print "Not implemented: parsing header Range: %s" % range_header
return (None, None)
regex = re.compile(r"^bytes=(\d+)\-(\d+)?")
rangething = regex.search(range_header)
if rangething:
from_val = int(rangething.group(1))
if rangething.group(2) is not None:
return (from_val, int(rangething.group(2)))
else:
return (from_val, None)
else:
print 'CANNOT PARSE RANGE HEADER:', range_header
return (None, None)
def get_server(port=8000, next_attempts=0, serve_path=None):
Handler = RequestHandler
if serve_path:
Handler.serve_path = serve_path
while next_attempts >= 0:
try:
httpd = ThreadingHTTPServer(("", port), Handler)
return httpd
except socket.error as e:
if e.errno == errno.EADDRINUSE:
next_attempts -= 1
port += 1
else:
raise
def main(args=None):
if args is None:
args = sys.argv[1:]
PORT = 8000
if len(args)>0:
PORT = int(args[-1])
serve_path = DATA_DIR
if len(args) > 1:
serve_path = abspath(args[-2])
httpd = get_server(port=PORT, serve_path=serve_path)
print "serving at port", PORT
httpd.serve_forever()
if __name__ == "__main__" :
main()
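To verify the partial-content support, request a byte range and look for a 206 response in the headers (the file name here is just a placeholder):
$ curl -i -r 0-99 http://localhost:8000/somefile.bin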

Python tar a directory and symmetric encrypt with gpg

At this point the script works great for a single file. When a directory is given, it uses tar to create a single file, which works well; the tar file is then gpg-encrypted with the password provided, and the gpg step also works. The problem is that when you decrypt the gpg file, the tar is corrupted every time. I'm trying to find what I'm doing wrong here. Please help.
#!/usr/bin/env python3
# Takes file in does symmetric encryption with the password you provide
# then adds it to a running IPFS(ipfs.io) instance.
#
import os
import argparse
import gnupg
import ipfsapi
import tarfile
# Parse command arguments
parser = argparse.ArgumentParser(description='Encrypt file/directory and add it to IPFS')
parser.add_argument('-i','--input', help='File.txt or Directory', required=True)
parser.add_argument('-p','--password', help='Password to encrypt with', required=True)
args = parser.parse_args()
# Set GPG Home directory
gpg = gnupg.GPG(homedir='')
# Set GPG Encoding
gpg.encoding = 'utf-8'
# Get dataToEncrypt full path
dataToEncrypt = (os.path.abspath(args.input))
# Set up tar filename to end with .tar
tarFile = ("{}.tar".format(dataToEncrypt))
# Setup encrypted filename to end with .gpg
encryptedFile = ("{}.tar.gpg".format(dataToEncrypt))
# Tell module where IPFS instance is located
api = ipfsapi.connect('127.0.0.1', 5001)
def dataTar():
if os.path.isfile(dataToEncrypt):
return
else:
#return
with tarfile.open(tarFile, 'w|') as tar:
tar.add(dataToEncrypt)
tar.close()
def encryptFile():
passphrase = (args.password)
if os.path.isfile(dataToEncrypt):
with open(dataToEncrypt, 'rb') as f:
status = gpg.encrypt(f,
encrypt=False,
symmetric='AES256',
passphrase=passphrase,
armor=False,
output=dataToEncrypt + ".gpg")
else:
with open(tarFile, 'rb') as f:
status = gpg.encrypt(f,
encrypt=False,
symmetric='AES256',
passphrase=passphrase,
armor=False,
output=dataToEncrypt + ".tar.gpg")
print ('ok: ', status.ok)
print ('status: ', status.status)
print ('stderr: ', status.stderr)
def ipfsFile(encryptedFile):
# Add encrypted file to IPFS
ipfsLoadedFile = api.add(encryptedFile, wrap_with_directory=True)
# Return Hash of new IPFS File
fullHash = (ipfsLoadedFile[1])
ipfsHash = fullHash['Hash']
return(ipfsHash)
def delEncryptedFile(encryptedFile):
try:
os.remove(encryptedFile)
except:
print("Error: %s unable to find or delete file." % encryptedFile)
def main():
dataTar()
encryptFile()
#ipfsFile(encryptedFile)
#print ("File encrypted and added to IPFS with this hash " + ipfsFile(encryptedFile))
#delEncryptedFile(encryptedFile)
if __name__ == "__main__":
main()
The code looks fine. I just tried it with https://pypi.org/project/python-gnupg/ and it works. I had to fix the API calls according to this package, but I don't think that matters; just diff it to see the changes. I don't see any problem, except that you should be decrypting with gpg -d file.tar.gpg | tar xvf -.
#!/usr/bin/env python3
# Takes file in does symmetric encryption with the password you provide then
# adds it to a running IPFS (ipfs.io) instance.
import os
import argparse
import gnupg
import tarfile
parser = argparse.ArgumentParser(
description='Encrypt file/directory and add it to IPFS')
parser.add_argument('-i','--input',
help='File.txt or Directory',
required=True)
parser.add_argument('-p','--password',
help='Password to encrypt with',
required=True)
args = parser.parse_args()
gpg = gnupg.GPG()
gpg.encoding = 'utf-8'
dataToEncrypt = (os.path.abspath(args.input))
tarFile = ("{}.tar".format(dataToEncrypt))
encryptedFile = ("{}.tar.gpg".format(dataToEncrypt))
def dataTar():
if os.path.isfile(dataToEncrypt):
return
else:
with tarfile.open(tarFile, 'w|') as tar:
tar.add(dataToEncrypt)
tar.close()
def encryptFile():
passphrase = (args.password)
if os.path.isfile(dataToEncrypt):
with open(dataToEncrypt, 'rb') as f:
status = gpg.encrypt(f.read(),
recipients=None,
symmetric='AES256',
passphrase=passphrase,
armor=False,
output=dataToEncrypt + ".gpg")
else:
with open(tarFile, 'rb') as f:
status = gpg.encrypt(f.read(),
recipients=None,
symmetric='AES256',
passphrase=passphrase,
armor=False,
output=dataToEncrypt + ".tar.gpg")
print ('ok: ', status.ok)
print ('status: ', status.status)
print ('stderr: ', status.stderr)
def ipfsFile(encryptedFile):
ipfsLoadedFile = api.add(encryptedFile, wrap_with_directory=True)
fullHash = (ipfsLoadedFile[1])
ipfsHash = fullHash['Hash']
return(ipfsHash)
def delEncryptedFile(encryptedFile):
try:
os.remove(encryptedFile)
except:
print("Error: %s unable to find or delete file." % encryptedFile)
def main():
dataTar()
encryptFile()
if __name__ == "__main__":
main()
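For completeness, the decryption side can also be done from Python rather than the gpg -d ... | tar xvf - pipeline; a minimal sketch using the same python-gnupg package (file names and passphrase are placeholders):
#!/usr/bin/env python3
# Sketch: decrypt the .tar.gpg and unpack it (placeholder names)
import gnupg
import tarfile

gpg = gnupg.GPG()
with open('data.tar.gpg', 'rb') as f:
    status = gpg.decrypt_file(f, passphrase='yourpassword', output='data.tar')
print('ok: ', status.ok)
with tarfile.open('data.tar') as tar:
    tar.extractall()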

Multiprocessing Python Bottle Multiple Commands independent of a running pool.apply_async

The problem: I made a REST API out of Bottle to start Elasticsearch bulk loads. The bulk load process runs inside a multiprocessing pool, and while it is running the API won't accept any other commands.
I've even tried running another instance of the API on a different port (I have a separate question about that too), but the second one doesn't accept commands either.
I want to be able to call a different API command to get the status of the load and return it. Currently it just includes ES data, but eventually it's going to include each node's stats. This is designed to run from Jenkins and initiate parallel loads.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Copyright [current year] the Melange authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from elasticsearch import Elasticsearch
from elasticsearch.exceptions import RequestError
from subprocess import Popen, PIPE
from multiprocessing import Pool, Process, pool
from datetime import datetime
import boto3
import sys
import os
import argparse
import logging
import logging.config
from bottle import route, run
from boto.cloudformation.stack import Output
import json
#this is what is called to set up the loading process from the api.
def start_load(secret, access, protocol, host, ports, index, type, mapping, data,threads):
# decompress a gzip string
def decompress_gzip(data):
return Popen(['zcat'], stdout=PIPE, stdin=PIPE).communicate(input=data)[0]
# parse an s3 path into a bucket and key 's3://my-bucket/path/to/data' -> ('my-bucket', 'path/to/data')
def parse_s3_path(str):
_, _, bucket, key = str.split('/', 3)
return (bucket, key)
def shell_command_execute(command):
p = Popen(command, stdout=PIPE, shell=True)
(output, err) = p.communicate()
return output
# load an S3 file to elasticsearch
def load_s3_file(s3_bucket, s3_key, es_host, es_port, es_index, es_type, access, secret):
try:
logging.info('loading s3://%s/%s', s3_bucket, s3_key)
s3 = boto3.client('s3', aws_access_key_id=access, aws_secret_access_key=secret)
file_handle = s3.get_object(Bucket=s3_bucket, Key=s3_key)
file_contents = file_handle['Body'].read()
logging.info('%s'%s3_key)
if file_contents:
if s3_key.endswith('.gz'):
file_contents = decompress_gzip(file_contents)
es = Elasticsearch(host=es_host, port=es_port, timeout=180)
es.bulk(body=file_contents, index=es_index, doc_type=es_type, timeout=120)
except Exception as e:
logging.error("There has been a major error %s" % e)
# load an S3 file to elasticsearch
def load_single_s3_file(s3_bucket, s3_key, es_host, es_port, es_index, es_type, access, secret):
try:
logging.info('loading s3://%s/%s', s3_bucket, s3_key)
s3 = boto3.client('s3', aws_access_key_id=access, aws_secret_access_key=secret)
file_handle = s3.get_object(Bucket=s3_bucket, Key=s3_key)
file_contents = file_handle['Body'].read()
logging.info('%s'%s3_key)
if file_contents:
if s3_key.endswith('.gz'):
file_contents = decompress_gzip(file_contents)
es = Elasticsearch(host=es_host, port=es_port, timeout=180)
res = es.get(index="test-index", doc_type='tweet', id=1)
es.insert(body = file_contents, index = es_index, doc_type=es_type, timeout=120)
except Exception as e:
logging.error("There has been a major error %s" % e)
start = datetime.now()
es_url = protocol + '://' + host + ':' + str(ports) + '/' + index + '/' + type
es = Elasticsearch(host=host, port=ports, timeout=180)
# S3 file - https://boto3.readthedocs.org/en/latest/reference/services/s3.html#object
s3 = boto3.client('s3', aws_access_key_id=access, aws_secret_access_key=secret)
s3_bucket, s3_key = parse_s3_path(mapping)
file_handle = s3.get_object(Bucket=s3_bucket, Key=s3_key)
mapping = file_handle['Body'].read()
try:
es.indices.create(index=index, body=mapping)
except:
logging.error('index exist')
logging.info('starting to load %s to %s', data, es_url)
es.indices.put_settings({'index': {'refresh_interval': '-1'}}, index=index)
pool = Pool(processes=int(threads))
s3 = boto3.resource('s3', aws_access_key_id=access, aws_secret_access_key=secret)
s3_bucket, s3_key = parse_s3_path(data)
for file_summary in s3.Bucket(s3_bucket).objects.all():
if file_summary.key.startswith(s3_key):
pool.apply_async(load_s3_file, args=(s3_bucket, file_summary.key, host, ports, index, type, access, secret))
pool.close()
pool.join()
es.indices.put_settings({'index': {'refresh_interval': '1s'}}, index=index)
logging.info('finished loading %s to %s in %s', data, es_url, str(datetime.now() - start))
sys.exit(0)
#reset_es_settings(host, ports)
#This is what is called when no arguments are given
@route('/load_data/')
def no_comands():
return """Please include all nessecary values: example:
Start Load
http://127.0.0.1:8001/load_data/load&host=ip or DNS&thread=5&mappinglocation=tr-ips-ses-data|mappings|version_1_2|wos.mapping&datalocation=tr-ips-ses-data|json-data|wos|20150724|wos-1&port=9200&index=wos4&protocol=http&type=wos&access=access_key&secret=secret_key
Delete Index
http://127.0.0.1:8001/delete/wos4&host=ip or DNS&port=9200
with loading you must specify the load command as shown above
use & to seperate values
use = to seperate key value pairs
use | to insert \
"""
@route('/load_data/<name>', method='GET')
def commands( name="Execute Load" ):
values = name.split('&')
# split apart the URL syntax: items are split by &, key/value pairs by =, and any place that needs \ gets |
try:
command = values[0]
host = values[1] + ".us-west-2.elb.amazonaws.com"
threads = values[2]
mapping_location = values[3].replace('|', '/')
data_location = values[4].replace('|', '/')
#mapping_location = values[3]
#data_location = values[4]
ports = values[5]
index = values[6]
protocol = values[7]
type = values[8]
access = values[9]
secret = values[10]
host = host.split('=')[1]
threads = threads.split('=')[1]
mapping_location = "s3://" + mapping_location.split('=')[1]
data_location = "s3://" + data_location.split('=')[1]
ports = ports.split('=')[1]
index = index.split('=')[1]
protocol = protocol.split('=')[1]
types = type.split('=')[1]
access = access.split('=')[1]
secret = secret.split('=')[1]
yield ("Starting Load of data use /get_status/es_url&es_port&index to get the status of your load.")
start_load(secret, access, protocol, host, ports, index, types, mapping_location, data_location,threads)
except Exception as e:
logging.error(e)
yield """Please include all nessecary values: example:
Start Load
http://127.0.0.1:8001/load_data/load&host=ip or DNS&thread=5&mappinglocation=tr-ips-ses-data|mappings|version_1_2|wos.mapping&datalocation=tr-ips-ses-data|json-data|wos|20150724|wos-1&port=9200&index=wos4&protocol=http&type=wos&access=access_key&secret=secret_key
Delete Index
http://127.0.0.1:8001/delete/wos4&host=ip or DNS&port=9200
with loading you must specify the load command as shown above
use & to seperate values
use = to seperate key value pairs
use | to insert \
"""
# This is what is called when /delete/ is used.
@route('/delete/<name>', method='GET')
def recipe_delete( name="Delete Index" ):
def shell_command_execute(command):
p = Popen(command, stdout=PIPE, shell=True)
(output, err) = p.communicate()
return output
values = name.split('&')
try:
# split apart the URL syntax: items are split by &, key/value pairs by =
index = values[0]
host = values[1] + ".us-west-2.elb.amazonaws.com"
host = host.split('=')[1]
port = values[2]
port = port.split('=')[1]
except Exception as e:
logging.error(e)
return """Please include all nessecary values: example:
Start Load
http://127.0.0.1:8001/load_data/load&host=ip or DNS&thread=5&mappinglocation=tr-ips-ses-data|mappings|version_1_2|wos.mapping&datalocation=tr-ips-ses-data|json-data|wos|20150724|wos-1&port=9200&index=wos4&protocol=http&type=wos&access=access_key&secret=secret_key
Delete Index
http://127.0.0.1:8001/delete/wos4&host=ip or DNS&port=9200
with loading you must specify the load command as shown above
use & to seperate values
use = to seperate key value pairs
use | to insert \
"""
try:
#This is the command that deletes the index.
curl_command = 'curl -XDELETE http://' + host + ':9200/' + index
shell_command_execute(curl_command)
return "Successfully Deleted Index"
except Exception as e:
logging.error(e)
return "Failed to Deleted Index %s" % e
if __name__ == '__main__':
reload(sys)
sys.setdefaultencoding('utf8')
url = os.path.dirname(os.path.realpath(__file__)) + '/logging.ini'
print url
logging.config.fileConfig(url)
run(host='172.31.28.189', port=8001, debug=True)
#run(host='127.0.0.1', port=8001, debug=True)
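For what it's worth, the blocking is expected: Bottle's default WSGI server handles one request at a time, and the handler itself doesn't return until pool.join() finishes. A rough sketch of one workaround, with placeholder names rather than the code above, is to push the whole load into a separate multiprocessing.Process so the handler returns immediately and a status route stays reachable:
# Rough sketch (placeholder names): detach the load so the API stays responsive
from multiprocessing import Process
from bottle import route, run

def long_running_load(job_id):
    # stand-in for start_load(...); runs in its own process
    import time
    time.sleep(60)

@route('/load/<job_id>')
def start(job_id):
    worker = Process(target=long_running_load, args=(job_id,))
    worker.start()  # returns at once; the child process owns the pool and the wait
    return "load %s started" % job_id

@route('/status/<job_id>')
def status(job_id):
    # reachable while the load runs, e.g. to query Elasticsearch for progress
    return "status of %s" % job_id

if __name__ == '__main__':
    run(host='127.0.0.1', port=8001)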

how can i find out the uploaded file name in python cgi

I made a simple web server, shown below.
import BaseHTTPServer, os, cgi
import cgitb; cgitb.enable()
html = """
<html>
<body>
<form action="" method="POST" enctype="multipart/form-data">
File upload: <input type="file" name="upfile">
<input type="submit" value="upload">
</form>
</body>
</html>
"""
class Handler(BaseHTTPServer.BaseHTTPRequestHandler):
def do_GET(self):
self.send_response(200)
self.send_header("content-type", "text/html;charset=utf-8")
self.end_headers()
self.wfile.write(html)
def do_POST(self):
ctype, pdict = cgi.parse_header(self.headers.getheader('content-type'))
if ctype == 'multipart/form-data':
query = cgi.parse_multipart(self.rfile, pdict)
upfilecontent = query.get('upfile')
if upfilecontent:
# I don't know how to get the file name, so I named it 'tmp.dat'
fout = file(os.path.join('tmp', 'tmp.dat'), 'wb')
fout.write (upfilecontent[0])
fout.close()
self.do_GET()
if __name__ == '__main__':
server = BaseHTTPServer.HTTPServer(("127.0.0.1", 8080), Handler)
print('web server on 8080..')
server.serve_forever()
In the do_POST method of BaseHTTPRequestHandler, I got the uploaded file data successfully,
but I can't figure out how to get the original name of the uploaded file.
self.rfile.name is just a 'socket'.
How can I get the uploaded file name?
Pretty broken code you're using there as a starting point (e.g. look at that global rootnode where name rootnode is used nowhere -- clearly half-edited source, and badly at that).
Anyway, what form are you using "client-side" for the POST? How does it set that upfile field?
Why aren't you using the normal FieldStorage approach, as documented in Python's docs? That way, you could use the .file attribute of the appropriate field to get a file-like object to read, or its .value attribute to read it all in memory and get it as a string, plus the .filename attribute of the field to know the uploaded file's name. More detailed, though concise, docs on FieldStorage, are here.
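A minimal illustration of those attributes (this assumes a real CGI environment, which is exactly what's missing here):
# Sketch: FieldStorage attributes, valid inside a proper CGI script
import cgi

form = cgi.FieldStorage()
field = form['upfile']
print field.filename      # the file's original client-side name
data = field.file.read()  # .file is a file-like object (good for big uploads)
# or: data = field.value  # .value reads the whole upload into memory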
Edit: now that the OP has edited the Q to clarify, I see the problem: BaseHTTPServer does not set the environment according to the CGI specs, so the cgi module isn't very usable with it. Unfortunately the only simple approach to environment setting is to steal and hack a big piece of code from CGIHTTPServer.py (it wasn't intended for reuse, whence the need for, sigh, copy-and-paste coding), e.g....:
def populenv(self):
path = self.path
dir, rest = '.', 'ciao'
# find an explicit query string, if present.
i = rest.rfind('?')
if i >= 0:
rest, query = rest[:i], rest[i+1:]
else:
query = ''
# dissect the part after the directory name into a script name &
# a possible additional path, to be stored in PATH_INFO.
i = rest.find('/')
if i >= 0:
script, rest = rest[:i], rest[i:]
else:
script, rest = rest, ''
# Reference: http://hoohoo.ncsa.uiuc.edu/cgi/env.html
# XXX Much of the following could be prepared ahead of time!
env = {}
env['SERVER_SOFTWARE'] = self.version_string()
env['SERVER_NAME'] = self.server.server_name
env['GATEWAY_INTERFACE'] = 'CGI/1.1'
env['SERVER_PROTOCOL'] = self.protocol_version
env['SERVER_PORT'] = str(self.server.server_port)
env['REQUEST_METHOD'] = self.command
uqrest = urllib.unquote(rest)
env['PATH_INFO'] = uqrest
env['SCRIPT_NAME'] = 'ciao'
if query:
env['QUERY_STRING'] = query
host = self.address_string()
if host != self.client_address[0]:
env['REMOTE_HOST'] = host
env['REMOTE_ADDR'] = self.client_address[0]
authorization = self.headers.getheader("authorization")
if authorization:
authorization = authorization.split()
if len(authorization) == 2:
import base64, binascii
env['AUTH_TYPE'] = authorization[0]
if authorization[0].lower() == "basic":
try:
authorization = base64.decodestring(authorization[1])
except binascii.Error:
pass
else:
authorization = authorization.split(':')
if len(authorization) == 2:
env['REMOTE_USER'] = authorization[0]
# XXX REMOTE_IDENT
if self.headers.typeheader is None:
env['CONTENT_TYPE'] = self.headers.type
else:
env['CONTENT_TYPE'] = self.headers.typeheader
length = self.headers.getheader('content-length')
if length:
env['CONTENT_LENGTH'] = length
referer = self.headers.getheader('referer')
if referer:
env['HTTP_REFERER'] = referer
accept = []
for line in self.headers.getallmatchingheaders('accept'):
if line[:1] in "\t\n\r ":
accept.append(line.strip())
else:
accept = accept + line[7:].split(',')
env['HTTP_ACCEPT'] = ','.join(accept)
ua = self.headers.getheader('user-agent')
if ua:
env['HTTP_USER_AGENT'] = ua
co = filter(None, self.headers.getheaders('cookie'))
if co:
env['HTTP_COOKIE'] = ', '.join(co)
# XXX Other HTTP_* headers
# Since we're setting the env in the parent, provide empty
# values to override previously set values
for k in ('QUERY_STRING', 'REMOTE_HOST', 'CONTENT_LENGTH',
'HTTP_USER_AGENT', 'HTTP_COOKIE', 'HTTP_REFERER'):
env.setdefault(k, "")
os.environ.update(env)
This could be substantially simplified further, but not without spending some time and energy on that task:-(.
With this populenv function at hand, we can recode:
def do_POST(self):
populenv(self)
form = cgi.FieldStorage(fp=self.rfile)
upfilecontent = form['upfile'].value
if upfilecontent:
fout = open(os.path.join('tmp', form['upfile'].filename), 'wb')
fout.write(upfilecontent)
fout.close()
self.do_GET()
...and live happily ever after;-). (Of course, using any decent WSGI server, or even the demo one, would be much easier, but this exercise is instructive about CGI and its internals;-).
By using cgi.FieldStorage you can easily extract the filename. Check the example below:
def do_POST(self):
ctype, pdict = cgi.parse_header(self.headers.getheader('content-type'))
if ctype == 'multipart/form-data':
form = cgi.FieldStorage( fp=self.rfile, headers=self.headers, environ={'REQUEST_METHOD':'POST', 'CONTENT_TYPE':self.headers['Content-Type'], })
filename = form['upfile'].filename
data = form['upfile'].file.read()
open("./%s"%filename, "wb").write(data)
self.do_GET()
...or use your own version of cgi.parse_multipart, especially fixing this:
# my fix: prefer 'filename' over 'name' field!
if 'filename' in params:
name = params['filename']
name = os.path.basename(name) # Edge, IE return abs path!
elif 'name' in params:
name = params['name']
else:
continue
