AIRFLOW : Customise SFTPOperator to download multiple files - python

I'm trying to customise the SFTOperator take download multiple file from a server. I know that the original SFTPOperator only allow one file at a time.
I copied the same code from source and I twerk by adding a new function called get_xml_from_source(). Please refer the code below:
def get_xml_from_source(sftp_client, remote_filepath, local_filepath, prev_execution_date, execution_date):
"""
Copy from Source to local path
"""
files_attr = sftp_client.listdir_attr(remote_filepath) # eg: /source/ HITTING ERROR HERE
files_name = sftp_client.listdir(remote_filepath) # eg: /source/
today_midnight = datetime.combine(datetime.today(), time.min)
yesterday_midnight = today_midnight - timedelta(days=1)
for file_attr, file_name in zip(files_attr, files_name):
modified_time = datetime.fromtimestamp(file_attr.st_mtime)
if yesterday_midnight <= modified_time < today_midnight:
# if prev_execution_date <= modified_time < execution_date:
try:
# Download to local path
sftp_client.get(remote_filepath, local_filepath)
print(file_name)
except: # pylint: disable=bare-except
print("File not found")
else:
print("Not the file!")
Where this function will only download files from yesterday up to today.
I added the function at this line:
with self.ssh_hook.get_conn() as ssh_client:
sftp_client = ssh_client.open_sftp()
if self.operation.lower() == SFTPOperation.GET:
local_folder = os.path.dirname(self.local_filepath)
if self.create_intermediate_dirs:
# Create Intermediate Directories if it doesn't exist
try:
os.makedirs(local_folder)
except OSError:
if not os.path.isdir(local_folder):
raise
file_msg = "from {0} to {1}".format(self.remote_filepath,
self.local_filepath)
self.log.info("Starting to transfer %s", file_msg)
# This is where it starts to copy, customization begins here
# sftp_client.get(self.remote_filepath, self.local_filepath) <--- Original code that I commented out and replace with mine below
get_xml_from_source(sftp_client, self.remote_filepath,
self.local_filepath, self.prev_execution_date, self.execution_date)
Note that, rest of the codes did not change. It is how it looks like in the source.
I keep hitting error at files_attr = sftp_client.listdir_attr(remote_filepath) with this error:
Error while transferring from /source/ to
/path/to/destination, error: [Errno 2] No such file.
Which obviously meant, it can't find the sftp directory. I tried running the whole function locally, it works fine.
Is there any part of the code that tied the paramiko connection to only get one file? I checked the paramiko connection for SFTPOperator, it should be just fine. In this case, how should I fix it?
This is how I established my connection when running locally :
def connect_to_source():
"""
Get source credentials
:param: None
:return: username & password
"""
logging.basicConfig(stream=sys.stderr, level=logging.DEBUG)
ssh = paramiko.SSHClient()
ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
username, password = get_eet_credentials()
# key = paramiko.RSAKey.from_private_key_file(openssh_key, password=password)
ssh.connect(hostname=SFTP_SERVER, port=SFTP_PORT_NUMBER,
username=username, password=password)
client = ssh.open_sftp()
print("Connection to source success!")
return client
Lastly, below is my airflow task:
def copy_from_source():
"""
Copy XML file from source to local path
"""
return SFTPOperator(
task_id="copy_from_source",
ssh_conn_id="source_conn",
local_filepath=f"{current_dir}/destination",
remote_filepath= "/source/",
prev_execution_date='{{ prev_execution_date }}',
execution_date='{{ execution_date }}', # strftime("%Y-%m-%d %H:%M:%S")
create_intermediate_dirs=True,
operation="get",
dag=dag
)

I'm trying to do something similar to you. I'm not sure what is causing the issues you are facing but this is the updated SFTP Operator I have written that gets multiple files from a server
sftp_get_multiple_files_operator.py
import os
from pathlib import Path
from typing import Any
from airflow.exceptions import AirflowException
from airflow.models import BaseOperator
from airflow.contrib.hooks import SSHHook
class SFTPGetMultipleFilesOperator(BaseOperator):
template_fields = ('local_directory', 'remote_filename_pattern', 'remote_host')
def __init__(
self,
*,
ssh_hook=None,
ssh_conn_id=None,
remote_host=None,
local_directory=None,
remote_filename_pattern=None,
filetype=None,
confirm=True,
create_intermediate_dirs=False,
**kwargs,
) -> None:
super().__init__(**kwargs)
self.ssh_hook = ssh_hook
self.ssh_conn_id = ssh_conn_id
self.remote_host = remote_host
self.local_directory = local_directory
self.filetype = filetype
self.remote_filename_pattern = remote_filename_pattern
self.confirm = confirm
self.create_intermediate_dirs = create_intermediate_dirs
def execute(self, context: Any) -> str:
file_msg = None
try:
if self.ssh_conn_id:
if self.ssh_hook and isinstance(self.ssh_hook, SSHHook):
self.log.info("ssh_conn_id is ignored when ssh_hook is provided.")
else:
self.log.info(
"ssh_hook is not provided or invalid. Trying ssh_conn_id to create SSHHook."
)
self.ssh_hook = SSHHook(ssh_conn_id=self.ssh_conn_id)
if not self.ssh_hook:
raise AirflowException("Cannot operate without ssh_hook or ssh_conn_id.")
if self.remote_host is not None:
self.log.info(
"remote_host is provided explicitly. "
"It will replace the remote_host which was defined "
"in ssh_hook or predefined in connection of ssh_conn_id."
)
self.ssh_hook.remote_host = self.remote_host
with self.ssh_hook.get_conn() as ssh_client:
sftp_client = ssh_client.open_sftp()
all_files = sftp_client.listdir()
self.log.info(f'Found {len(all_files)} files on server')
timestamp = context['ds_nodash']
filename_pattern = self.remote_filename_pattern + timestamp
# fetch all CSV files for the run date that match the filename pattern
matching_files = [f for f in all_files
if f.find(filename_pattern) != -1]
# if file type is specified filter matching files for the file type
if self.filetype is not None:
matching_files = [filename for filename in matching_files
if filename[-len(self.filetype):] == self.filetype]
self.log.info(f'Found {len(matching_files)} files with name including {filename_pattern}')
local_folder = os.path.dirname(self.local_directory)
if self.create_intermediate_dirs:
Path(local_folder).mkdir(parents=True, exist_ok=True)
for f in matching_files:
self.log.info(f"Starting to transfer from /{f} to {self.local_directory}/{f}")
sftp_client.get(f'/{f}', f'{self.local_directory}/{f}')
except Exception as e:
raise AirflowException(f"Error while transferring {file_msg}, error: {str(e)}")
return self.local_directory
def _make_intermediate_dirs(sftp_client, remote_directory) -> None:
"""
Create all the intermediate directories in a remote host
:param sftp_client: A Paramiko SFTP client.
:param remote_directory: Absolute Path of the directory containing the file
:return:
"""
if remote_directory == '/':
sftp_client.chdir('/')
return
if remote_directory == '':
return
try:
sftp_client.chdir(remote_directory)
except OSError:
dirname, basename = os.path.split(remote_directory.rstrip('/'))
_make_intermediate_dirs(sftp_client, dirname)
sftp_client.mkdir(basename)
sftp_client.chdir(basename)
return
dag.py
sftp_report = SFTPGetMultipleFilesOperator(
task_id=f"sftp_reports_to_gcs",
ssh_conn_id="sftp_connection",
local_directory=f'/opt/airflow/dags/reports',
remote_filename_pattern=f'reportname_', # ds_nodash is added in the operator by accessing Airflow context
create_intermediate_dirs=True,
filetype='.csv'
)

Related

Get List of latest stable files from FTP server

I'm trying to get the list of files that are fully uploaded on the FTP server.
I have access to this FTP server where a 3rd party writes data and marker files every 15 minutes. Once the data file is completely uploaded then a marker file gets created. we know once this marker file is there that means data files are ready and we can download it. I'm looking for a way to efficiently approach this problem. I want to check every minute if there are any new stable files on FTP server, if there is then I'll download those files. one preferred way is see if the marker file is 2 minutes old then we are good to download marker file and corresponding data file.
I'm new with python and looking for help.
I have some code till I list out the files
import paramiko
from datetime import datetime, timedelta
FTP_HOST = 'host_address'
FTP_PORT = 21
FTP_USERNAME = 'username'
FTP_PASSWORD = 'password'
FTP_ROOT_PATH = 'path_to_dir'
def today():
return datetime.strftime(datetime.now(), '%Y%m%d')
def open_ftp_connection(ftp_host, ftp_port, ftp_username, ftp_password):
"""
Opens ftp connection and returns connection object
"""
client = paramiko.SSHClient()
client.load_system_host_keys()
try:
transport = paramiko.Transport(ftp_host, ftp_port)
except Exception as e:
return 'conn_error'
try:
transport.connect(username=ftp_username, password=ftp_password)
except Exception as identifier:
return 'auth_error'
ftp_connection = paramiko.SFTPClient.from_transport(transport)
return ftp_connection
def show_ftp_files_stat():
ftp_connection = open_ftp_connection(FTP_HOST, int(FTP_PORT), FTP_USERNAME, FTP_PASSWORD)
full_ftp_path = FTP_ROOT_PATH + "/" + today()
file_attr_list = ftp_connection.listdir_attr(full_ftp_path)
print(file_attr_list)
for file_attr in file_attr_list:
print(file_attr.filename, file_attr.st_size, file_attr.st_mtime)
if __name__ == '__main__':
show_ftp_files_stat()
Sample file name
org-reference-delta-quotes.REF.48C2.20200402.92.1.1.txt.gz
Sample corresponding marker file name
org-reference-delta-quotes.REF.48C2.20200402.92.note.txt.gz
I solved my use case with 2 min stable rule, if modified time is within 2 min of the current time, I consider them stable.
import logging
import time
from datetime import datetime, timezone
from ftplib import FTP
FTP_HOST = 'host_address'
FTP_PORT = 21
FTP_USERNAME = 'username'
FTP_PASSWORD = 'password'
FTP_ROOT_PATH = 'path_to_dir'
logger = logging.getLogger()
logger.setLevel(logging.ERROR)
def today():
return datetime.strftime(datetime.now(tz=timezone.utc), '%Y%m%d')
def current_utc_ts():
return datetime.utcnow().timestamp()
def current_utc_ts_minus_120():
return int(datetime.utcnow().timestamp()) - 120
def yyyymmddhhmmss_string_epoch_ts(dt_string):
return time.mktime(time.strptime(dt_string, '%Y%m%d%H%M%S'))
def get_ftp_connection(ftp_host, ftp_username, ftp_password):
try:
ftp = FTP(ftp_host, ftp_username, ftp_password)
except Exception as e:
print(e)
logger.error(e)
return 'conn_error'
return ftp
def get_list_of_files(ftp_connection, date_to_process):
full_ftp_path = FTP_ROOT_PATH + "/" + date_to_process + "/"
ftp_connection.cwd(full_ftp_path)
entries = list(ftp_connection.mlsd())
entry_list = [line for line in entries if line[0].endswith('.gz') | line[0].endswith('.zip')]
ftp_connection.quit()
print('Total file count', len(entry_list))
return entry_list
def parse_file_list_to_dict(entries):
try:
file_dict_list = []
for line in entries:
file_dict = dict({"file_name": line[0],
"server_timestamp": int(yyyymmddhhmmss_string_epoch_ts(line[1]['modify'])),
"server_date": line[0].split(".")[3])
file_dict_list.append(file_dict)
except IndexError as e:
# Output expected IndexErrors.
logging.exception(e)
except Exception as exception:
# Output unexpected Exceptions.
logging.exception(exception, False)
return file_dict_list
def get_stable_files_dict_list(dict_list):
stable_list = list(filter(lambda d: d['server_timestamp'] < current_utc_ts_minus_120(), dict_list))
print('stable file count: {}'.format(len(stable_list)))
return stable_list
if __name__ == '__main__':
ftp_connection = get_ftp_connection(FTP_HOST, FTP_USERNAME, FTP_PASSWORD)
if ftp_connection == 'conn_error':
logger.error('Failed to connect FTP Server!')
else:
file_list = get_list_of_files(ftp_connection, today())
parse_file_list = parse_file_list_to_dict(file_list)
stable_file_list = get_stable_files_dict_list(parse_file_list)

Python Subprocess Module | Postgres pg_dump with password

I have a database that I want to back up with my a python code.
I tried to base my code from the code in this discussion that uses the subprocess module and pg_dump. My problem now is that I have to manually type in the password to get the backup file. I read somewhere that doing a .pgpass but I do want to see if it is possible to do it within the subprocess module.
My code follows below:
from subprocess import Popen, PIPE
from pathlib import Path, PureWindowsPath
def backup():
version = 11
postgresDir = Path("C:/Program Files/PostgreSQL/{}/bin/".format(version))
directory = PureWindowsPath(postgresDir)
filename = 'myBackUp2' # output filename here
saveDir = Path("D:/Desktop/{}.tar".format(filename)) # output directory here
file = PureWindowsPath(saveDir)
host = 'localhost'
user = 'postgres'
port = '5434'
dbname = 'database_name' # database name here
proc = Popen(['pg_dump', '-h', host, '-U', user, '-W', '-p', port,
'-F', 't', '-f', str(file), '-d', dbname],
cwd=directory, shell=True, stdin=PIPE)
proc.wait()
backup()
The code above works and the backup is created is I type in the password. I tried to replace proc.wait() with the code below to remove the need of typing the password manually:
return proc.communicate('{}\n'.format(database_password))
But I would receive this error:
TypeError: a bytes-like object is required, not 'str'
Is this possible to do within subprocess? If so, how?
Use a password file.
On Microsoft Windows the file is named %APPDATA%\postgresql\pgpass.conf (where %APPDATA% refers to the Application Data subdirectory in the user's profile).
and the -w or --no-password command line option (instead of -W)
-w
--no-password
Never issue a password prompt. If the server requires password authentication and a password is not available by other means such as a .pgpass file, the connection attempt will fail. This option can be useful in batch jobs and scripts where no user is present to enter a password.
The easiest is to use the PGPASSWORD environment variable.
There is two classes:
First class needed for create dsn string. Then try to connect
with dsn parameters. If cannot connect go to Second class.
Second class needed for create for create DataBase and restore all tables
from file. You need to remake this strings:
for correctly open your DataBase dump_file
__folder_name = Path(__file__).parent.parent
__folder_name_data = os.path.join(__folder_name, 'data')
__file_to_open = os.path.join(__folder_name_data, 'bd.backup')
import os
import textwrap
from pathlib import Path
from subprocess import Popen, PIPE
class DataBaseAPI:
__slots__ = ('__dsn', 'cur')
def __init__(self):
self.__dsn = self.__dsn_string()
self.cur = self.__connection()
#staticmethod
def __dsn_string() -> dict:
print(f'INPUT name of DataBase')
name = input()
print(f'INPUT password of DataBase')
password = input()
print(f'INPUT user_name of DataBase or press ENTER if user_name="postgres"')
user_name = input()
if len(user_name) == 0:
user_name = 'postgres'
print(f'INPUT host_name of DataBase or press ENTER if host_name="localhost"')
host_name = input()
if len(host_name) == 0:
host_name = 'localhost'
return {'dbname': name, 'user': user_name, 'password': password, 'host': host_name}
def __connection(self):
try:
conn = psycopg2.connect(dbname=self.__dsn['dbname'], user=self.__dsn['user'],
host=self.__dsn['host'], password=self.__dsn['password'], port=5432)
except psycopg2.OperationalError:
print(textwrap.fill(f'There is no existing DataBase. Creating new DataBase', 80,
subsequent_indent=' '))
DataBaseCreator(self.__dsn)
conn = psycopg2.connect(dbname=self.__dsn['dbname'], user=self.__dsn['user'],
host=self.__dsn['host'], password=self.__dsn['password'], port=5432)
finally:
conn.autocommit = True
cur = conn.cursor()
print(f'DataBase connection complete')
return cur
class DataBaseCreator:
def __init__(self, dsn):
self.__dsn = dsn
self.__check_conf_file()
self.__create_data_base()
self.__restore_data_base()
def __check_conf_file(self):
__app_data = os.environ.copy()["APPDATA"]
__postgres_path = Path(f'{__app_data}\postgresql')
__pgpass_file = Path(f'{__postgres_path}\pgpass.conf')
parameters = f'{self.__dsn["host"]}:{5432}:{self.__dsn["dbname"]}:' \
f'{self.__dsn["user"]}:{int(self.__dsn["password"])}\n'
if not os.path.isdir(__postgres_path):
os.makedirs(__postgres_path)
if os.path.isfile(__pgpass_file):
log.debug(f'File "pgpass.conf" already exists')
with open(__pgpass_file, 'r+') as f:
content = f.readlines()
if parameters not in content:
# сервер: порт:база_данных: имя_пользователя:пароль
f.write(parameters)
else:
log.info(f' {parameters} already in "pgpass.conf" file')
else:
log.debug(f'File "pgpass.conf" not exists')
with open(__pgpass_file, 'x') as f:
# сервер: порт:база_данных: имя_пользователя:пароль
f.write(parameters)
def __create_data_base(self):
try:
__conn = psycopg2.connect(dbname='postgres', user=self.__dsn['user'],
host=self.__dsn['host'], password=self.__dsn['password'], port=5432)
except Exception as _:
log.exception(f'{_}')
else:
__conn.autocommit = True
__cur = __conn.cursor()
__query = f'CREATE DATABASE "{self.__dsn["dbname"]}"'
__cur.execute(__query)
log.info(f'{__query}')
def __restore_data_base(self):
__col = [x for x in self.__dsn.values()]
__folder_name = Path(__file__).parent.parent
__folder_name_data = os.path.join(__folder_name, 'data')
__file_to_open = os.path.join(__folder_name_data, 'bd.backup')
__cmd = f'pg_restore --host={__col[3]} --dbname={__col[0]} --username={__col[1]} ' \
f'--verbose=True --no-password ' \
f'{__file_to_open}'
try:
__proc = Popen(__cmd, stdout=PIPE, stderr=PIPE)
except FileNotFoundError:
log.info(f'FileNotFoundError: [WinError 2] Не удается найти указанный файл')
log.info(textwrap.fill(f'You need to SET Windows $PATH for use "pg_restore" in cmd', 80,
subsequent_indent=' '))
else:
__stderr = __proc.communicate()[1].decode('utf-8', errors="ignore").strip()
log.debug(textwrap.fill(f'{__stderr}', 80))
One more option is to use dbname parameter
'pg_dump --dbname=postgresql://{}:{}#{}:{}/{}'.format(user, password, host, port, database_name)

Multiprocessing Python Bottle Multiple Commands independent of a running pool.apply_async

The Problem, I made a rest api out of bottle to start Elasticsearch bulk loads. The bulk load process runs inside of a multiprocess pool, the problem is while that is running the api wont except any other commands.
I've even tried running another instance of the api on a different port but I have stack question about that too. The second one doesn't accept commands.
I want to be able to call a different api command to get the status of the load and return it. Currently it just includes ES data, but eventually its going to include each nodes stats. This is designed to run from Jenkins and initiate parallel loads.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Copyright [current year] the Melange authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from elasticsearch import Elasticsearch
from elasticsearch.exceptions import RequestError
from subprocess import Popen, PIPE
from multiprocessing import Pool, Process, pool
from datetime import datetime
import boto3
import sys
import os
import argparse
import logging
import logging.config
from bottle import route, run
from boto.cloudformation.stack import Output
import json
#this is what is called to set up the loading process from the api.
def start_load(secret, access, protocol, host, ports, index, type, mapping, data,threads):
# decompress a gzip string
def decompress_gzip(data):
return Popen(['zcat'], stdout=PIPE, stdin=PIPE).communicate(input=data)[0]
# parse an s3 path into a bucket and key 's3://my-bucket/path/to/data' -> ('my-bucket', 'path/to/data')
def parse_s3_path(str):
_, _, bucket, key = str.split('/', 3)
return (bucket, key)
def shell_command_execute(command):
p = Popen(command, stdout=PIPE, shell=True)
(output, err) = p.communicate()
return output
# load an S3 file to elasticsearch
def load_s3_file(s3_bucket, s3_key, es_host, es_port, es_index, es_type, access, secret):
try:
logging.info('loading s3://%s/%s', s3_bucket, s3_key)
s3 = boto3.client('s3', aws_access_key_id=access, aws_secret_access_key=secret)
file_handle = s3.get_object(Bucket=s3_bucket, Key=s3_key)
file_contents = file_handle['Body'].read()
logging.info('%s'%s3_key)
if file_contents:
if s3_key.endswith('.gz'):
file_contents = decompress_gzip(file_contents)
es = Elasticsearch(host=es_host, port=es_port, timeout=180)
es.bulk(body=file_contents, index=es_index, doc_type=es_type, timeout=120)
except Exception as e:
logging.error("There has been a major error %s" % e)
# load an S3 file to elasticsearch
def load_single_s3_file(s3_bucket, s3_key, es_host, es_port, es_index, es_type, access, secret):
try:
logging.info('loading s3://%s/%s', s3_bucket, s3_key)
s3 = boto3.client('s3', aws_access_key_id=access, aws_secret_access_key=secret)
file_handle = s3.get_object(Bucket=s3_bucket, Key=s3_key)
file_contents = file_handle['Body'].read()
logging.info('%s'%s3_key)
if file_contents:
if s3_key.endswith('.gz'):
file_contents = decompress_gzip(file_contents)
es = Elasticsearch(host=es_host, port=es_port, timeout=180)
res = es.get(index="test-index", doc_type='tweet', id=1)
es.insert(body = file_contents, index = es_index, doc_type=es_type, timeout=120)
except Exception as e:
logging.error("There has been a major error %s" % e)
start = datetime.now()
es_url = protocol + '://' + host + ':' + str(ports) + '/' + index + '/' + type
es = Elasticsearch(host=host, port=ports, timeout=180)
# S3 file - https://boto3.readthedocs.org/en/latest/reference/services/s3.html#object
s3 = boto3.client('s3', aws_access_key_id=access, aws_secret_access_key=secret)
s3_bucket, s3_key = parse_s3_path(mapping)
file_handle = s3.get_object(Bucket=s3_bucket, Key=s3_key)
mapping = file_handle['Body'].read()
try:
es.indices.create(index=index, body=mapping)
except:
logging.error('index exist')
logging.info('starting to load %s to %s', data, es_url)
es.indices.put_settings({'index': {'refresh_interval': '-1'}}, index=index)
pool = Pool(processes=int(threads))
s3 = boto3.resource('s3', aws_access_key_id=access, aws_secret_access_key=secret)
s3_bucket, s3_key = parse_s3_path(data)
for file_summary in s3.Bucket(s3_bucket).objects.all():
if file_summary.key.startswith(s3_key):
pool.apply_async(load_s3_file, args=(s3_bucket, file_summary.key, host, ports, index, type, access, secret))
pool.close()
pool.join()
es.indices.put_settings({'index': {'refresh_interval': '1s'}}, index=index)
logging.info('finished loading %s to %s in %s', data, es_url, str(datetime.now() - start))
sys.exit(0)
#reset_es_settings(host, ports)
#This is what is called when no arguments are given
#route('/load_data/')
def no_comands():
return """Please include all nessecary values: example:
Start Load
http://127.0.0.1:8001/load_data/load&host=ip or DNS&thread=5&mappinglocation=tr-ips-ses-data|mappings|version_1_2|wos.mapping&datalocation=tr-ips-ses-data|json-data|wos|20150724|wos-1&port=9200&index=wos4&protocol=http&type=wos&access=access_key&secret=secret_key
Delete Index
http://127.0.0.1:8001/delete/wos4&host=ip or DNS&port=9200
with loading you must specify the load command as shown above
use & to seperate values
use = to seperate key value pairs
use | to insert \
"""
#route('/load_data/<name>', method='GET')
def commands( name="Execute Load" ):
values = name.split('&')
#split apart the url syntax items are split by & key values by = and any plcae that needs \ gets |
try:
command = values[0]
host = values[1] + ".us-west-2.elb.amazonaws.com"
threads = values[2]
mapping_location = values[3].replace('|', '/')
data_location = values[4].replace('|', '/')
#mapping_location = values[3]
#data_location = values[4]
ports = values[5]
index = values[6]
protocol = values[7]
type = values[8]
access = values[9]
secret = values[10]
host = host.split('=')[1]
threads = threads.split('=')[1]
mapping_location = "s3://" + mapping_location.split('=')[1]
data_location = "s3://" + data_location.split('=')[1]
ports = ports.split('=')[1]
index = index.split('=')[1]
protocol = protocol.split('=')[1]
types = type.split('=')[1]
access = access.split('=')[1]
secret = secret.split('=')[1]
yield ("Starting Load of data use /get_status/es_url&es_port&index to get the status of your load.")
start_load(secret, access, protocol, host, ports, index, types, mapping_location, data_location,threads)
except Exception as e:
logging.error(e)
yield """Please include all nessecary values: example:
Start Load
http://127.0.0.1:8001/load_data/load&host=ip or DNS&thread=5&mappinglocation=tr-ips-ses-data|mappings|version_1_2|wos.mapping&datalocation=tr-ips-ses-data|json-data|wos|20150724|wos-1&port=9200&index=wos4&protocol=http&type=wos&access=access_key&secret=secret_key
Delete Index
http://127.0.0.1:8001/delete/wos4&host=ip or DNS&port=9200
with loading you must specify the load command as shown above
use & to seperate values
use = to seperate key value pairs
use | to insert \
"""
#This is what is cvalled when /delete/ is used.
#route('/delete/<name>', method='GET' )
def recipe_delete( name="Delete Index" ):
def shell_command_execute(command):
p = Popen(command, stdout=PIPE, shell=True)
(output, err) = p.communicate()
return output
values = name.split('&')
try:
#split apart the url syntax items are split by & key values by |
index = values[0]
host = values[1] + ".us-west-2.elb.amazonaws.com"
host = host.split('=')[1]
port = values[2]
port = port.split('=')[1]
except Exception as e:
logging.error(e)
return """Please include all nessecary values: example:
Start Load
http://127.0.0.1:8001/load_data/load&host=ip or DNS&thread=5&mappinglocation=tr-ips-ses-data|mappings|version_1_2|wos.mapping&datalocation=tr-ips-ses-data|json-data|wos|20150724|wos-1&port=9200&index=wos4&protocol=http&type=wos&access=access_key&secret=secret_key
Delete Index
http://127.0.0.1:8001/delete/wos4&host=ip or DNS&port=9200
with loading you must specify the load command as shown above
use & to seperate values
use = to seperate key value pairs
use | to insert \
"""
try:
#This is the command that deletes the index.
curl_command = 'curl -XDELETE http://' + host + ':9200/' + index
shell_command_execute(curl_command)
return "Successfully Deleted Index"
except Exception as e:
logging.error(e)
return "Failed to Deleted Index %s" % e
if __name__ == '__main__':
reload(sys)
sys.setdefaultencoding('utf8')
url = os.path.dirname(os.path.realpath(__file__)) + '/logging.ini'
print url
logging.config.fileConfig(url)
run(host='172.31.28.189', port=8001, debug=True)
#run(host='127.0.0.1', port=8001, debug=True)

Threads in python are using the same variable it seems

I built a class that watches for changes in a directory and upload them to a server, it is working fine for one dir. However, i had the idea to use the threading module from python to actually watch more than one directory. But, i am getting confused, since when i change a file in one location, it uploads just fine, but then the OTHER location starts uploading all it's files. I think it's because somehow the threads are sharing the same variable or something, but still it's impossible because each directory has it's own instance of the class working specifically for it.
Here's some code:
import os, ftplib, time
from threading import Thread
class FTPSync(Thread):
local_root = ''
remote_root = ''
host = ''
user = ''
password = ''
content = {
'previous': [],
'current': []
}
ignore = []
rest = 0.5
files = []
cwd = ''
watching = True
def __init__(self, local_root='', remote_root='', config={}):
Thread.__init__(self)
self.local_root = local_root if local_root != '' else os.path.join(os.path.dirname(__file__), os.pardir)
self.remote_root = remote_root
self.ignore = config['ignore'] if 'ignore' in config else []
self.rest = config['rest'] if 'rest' in config else 0.5
self.host, self.user, self.password = config['host'], config['user'], config['password']
self.content['previous'] = self.read_dir(self.local_root)
# Connect and reconnect to the server
def connect(self, reconnect=False):
print "Connecting..."
self.ftp = ftplib.FTP(self.host)
self.ftp.login(self.user, self.password)
print "Welcome message from server:\n"
print self.ftp.getwelcome()
if not reconnect:
self.cwd = self.remote_root
self.ftp.cwd(self.cwd)
# Start watching for local changes
def watch(self):
self.connect()
while self.watching:
self.files = []
self.content['current'] = self.read_dir(self.local_root)
diff = [f for f in self.content['current'] if f not in self.content['previous']]
if len(diff) > 0:
self.stor(diff)
self.content['previous'] = self.content['current']
diff = []
time.sleep(self.rest)
# Read a directory and its contents recursively
def read_dir(self, dir_name, return_value=True):
reading = os.listdir(dir_name)
file_content = None
for i in range(len(reading)):
d = self._local_abspath(dir_name, reading[i])
is_dir = os.path.isdir(d)
file_content = open(d).read() if not is_dir else None
offset = d.replace(self.local_root, '').replace(reading[i], '')
if is_dir and reading[i] not in self.ignore:
self.read_dir(d, return_value=False)
elif not is_dir:
info = {"name": reading[i], "content": file_content, "local_path": d, "offset": offset}
self.files.append(info)
if (return_value):
return self.files
pass
# Here we go
def run(self):
self.watch()
# Store (STOR) the files in the server
def stor(self, files):
nav = ''
try:
for f in files:
if self._server_abspath(f['offset']) != self.cwd:
nav = self._server_abspath(f['offset'])
self.ftp.cwd(nav)
mode = ''
if f['name'].split('.')[-1:][0] in ['jpg', 'png', 'gif'] or os.path.getsize(f['local_path']) > 8190:
mode = 'binary'
self.ftp.storbinary('STOR {!s}'.format(f['name']), open(f['local_path']))
else:
mode = 'ascii'
self.ftp.storlines('STOR {!s}'.format(f['name']), open(f['local_path']))
self.cwd = self._server_abspath(f['offset'])
print "Stored %s in %s mode" % (f['name'], mode)
# The connection has timed out
except ftplib.error_temp:
self.connect(reconnect=True)
self.stor(files)
# A new file has been created inside a folder that does not exist in the server
except ftplib.error_perm:
self.ftp.mkd(nav)
self.stor(files)
# A new folder has been created locally, but we'll wait to update this on the server
# when there's some content inside of it and throw us a ftplib.error_perm error, so here it'll just pass
except IOError:
pass
# Return the absolute path in the server
def _server_abspath(self, path):
return self.remote_root + '/' + path.replace('\\', '/')
# Return the absolute path locally
def _local_abspath(self, dn, fn):
return (dn +'\\'+ fn) if not dn[-1:]=='\\' else dn + fn
def start(local_root='', remote_root='', config={}):
instance = FTPSync(local_root, remote_root, config)
instance.start()
return instance
And this is how i use the class:
import ftpsync
config = {
'host': 'ftp.myhost.com',
'user': '****',
'password': '****',
'ignore': ['.git']
}
ftpsync.start(remote_root='/www/tst', config=config)
ftpsync.start(local_root='C:\\pygames', remote_root='/www/tst', config=config)
I would like to remember that it works fine for ONE directory.
After some time, I realized I had to use processes. I came back here in case someone finds it useful.
So basically, with threads you're just running two or more concurrent things at once, but they all share the same address space and memory, and can cause some unwanted things by having the same context and interacting with each other.
Now with processes, every process is independent from one another, so they all have resources reserved for each one of them. This won't let them share variables and stuff.

Python fast static file serving

What's the fastest way to serve static files in Python? I'm looking for something equal or close enough to Nginx's static file serving.
I know of SimpleHTTPServer but not sure if it can handle serving multiple files efficiently and reliably.
Also, I don't mind it being a part of a lib/framework of some sort as long as its lib/framework is lightweight.
EDIT: This project appears to be dead.
What about FAPWS3? One of the selling points:
Static file server
FAPWS can be used to serve a huge amount of static file requests. With the help of a async database in the backend, you can use FAPWS as your own Amazon S3.
If you look for a oneliner you can do the following:
$> python -m SimpleHTTPServer
This will not fullfil all the task required but worth mentioning that this is the simplest way :-)
I would highly recommend using a 3rd party HTTP server to serve static files.
Servers like nginx are heavily optimized for the task at hand, parallelized and written in fast languages.
Python is tied to one processor and interpreted.
Original SimpleHTTPServer from python standard library does NOT "handle serving multiple files efficiently and reliably". For instance, if you are downloading one file from it, another HTTP access to it must be hovering since SimpleHTTPServer.py is a simple singal-thread HTTP server which could only support one connecting simultaneously.
Fortunately, note that SimpleHTTPServer.py use BaseHTTPServer.HTTPServer as handler, which can be wrapped by SocketServer.ForkingMixIn and SocketServer.ThreadingMixIn also from python standard library to support multi-process and multi-thread mode, which could highly enhance simple HTTP server's "efficience and reliability".
According to this idea, a SimpleHTTPServer with multi-thread/multi-process support modified from original one is given as follows:
$ python2.7 ModifiedSimpleHTTPServer.py
usage: ModifiedSimpleHTTPServer.py [-h] [--pydoc] [--port PORT]
[--type {process,thread}] [--root ROOT]
[--run]
Modified SimpleHTTPServer with MultiThread/MultiProcess and IP bind support.
Original: https://docs.python.org/2.7/library/simplehttpserver.html
Modified by: vbem#163.com
optional arguments:
-h, --help show this help message and exit
--pydoc show this module's pydoc
run arguments:
--port PORT specify server port (default: 8000)
--type {process,thread}
specify server type (default: 'thread')
--root ROOT specify root directory (default: cwd '/home/vbem')
--run run http server foreground
NOTE: stdin for input, stdout for result, stderr for logging
For example, ModifiedSimpleHTTPServer.py --run --root /var/log --type process will run a multi-process HTTP static files server with '/var/log' as its root directory.
Modified codes are:
#! /usr/bin/env python2.7
# -*- coding: utf-8 -*-
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
r"""Modified SimpleHTTPServer with MultiThread/MultiProcess and IP bind support.
Original: https://docs.python.org/2.7/library/simplehttpserver.html
Modified by: vbem#163.com
"""
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
import os, sys, pwd, posixpath, BaseHTTPServer, urllib, cgi, shutil, mimetypes, socket, SocketServer, BaseHTTPServer
from cStringIO import StringIO
USERNAME = pwd.getpwuid(os.getuid()).pw_name
HOSTNAME = socket.gethostname()
PORT_DFT = 8000
class SimpleHTTPRequestHandler(BaseHTTPServer.BaseHTTPRequestHandler):
server_version = "SimpleHTTP/0.6"
def do_GET(self):
f = self.send_head()
if f:
self.copyfile(f, self.wfile)
f.close()
def do_HEAD(self):
f = self.send_head()
if f:
f.close()
def send_head(self):
path = self.translate_path(self.path)
f = None
if os.path.isdir(path):
if not self.path.endswith('/'):
self.send_response(301)
self.send_header("Location", self.path + "/")
self.end_headers()
return None
for index in "index.html", "index.htm":
index = os.path.join(path, index)
if os.path.exists(index):
path = index
break
else:
return self.list_directory(path)
ctype = self.guess_type(path)
try:
f = open(path, 'rb')
except IOError:
self.send_error(404, "File not found")
return None
self.send_response(200)
self.send_header("Content-type", ctype)
fs = os.fstat(f.fileno())
self.send_header("Content-Length", str(fs[6]))
self.send_header("Last-Modified", self.date_time_string(fs.st_mtime))
self.end_headers()
return f
def list_directory(self, path):
try:
list = ['..'] + os.listdir(path) #
except os.error:
self.send_error(404, "No permission to list directory")
return None
list.sort(key=lambda a: a.lower())
f = StringIO()
displaypath = cgi.escape(urllib.unquote(self.path))
f.write('<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 3.2 Final//EN">')
f.write("<html>\n<title>%s %s</title>\n<body>" % (HOSTNAME, displaypath))
f.write("%s#%s:<strong>%s</strong>\n" % (USERNAME, HOSTNAME, path.rstrip('/')+'/'))
f.write("<hr>\n<ul>\n")
for name in list:
fullname = os.path.join(path, name)
displayname = linkname = name
if os.path.isdir(fullname):
displayname = name + "/"
linkname = name + "/"
if os.path.islink(fullname):
displayname = name + "#"
f.write('<li>%s\n'
% (urllib.quote(linkname), cgi.escape(displayname)))
f.write("</ul>\n<hr>\n<pre>%s</pre>\n</body>\n</html>\n" % __doc__)
length = f.tell()
f.seek(0)
self.send_response(200)
encoding = sys.getfilesystemencoding()
self.send_header("Content-type", "text/html; charset=%s" % encoding)
self.send_header("Content-Length", str(length))
self.end_headers()
return f
def translate_path(self, path):
path = path.split('?',1)[0]
path = path.split('#',1)[0]
path = posixpath.normpath(urllib.unquote(path))
words = path.split('/')
words = filter(None, words)
path = os.getcwd()
for word in words:
drive, word = os.path.splitdrive(word)
head, word = os.path.split(word)
if word in (os.curdir, os.pardir): continue
path = os.path.join(path, word)
return path
def copyfile(self, source, outputfile):
shutil.copyfileobj(source, outputfile)
def guess_type(self, path):
base, ext = posixpath.splitext(path)
if ext in self.extensions_map:
return self.extensions_map[ext]
ext = ext.lower()
if ext in self.extensions_map:
return self.extensions_map[ext]
else:
return self.extensions_map['']
if not mimetypes.inited:
mimetypes.init()
extensions_map = mimetypes.types_map.copy()
extensions_map.update({'': 'text/plain'})
class ProcessedHTTPServer(SocketServer.ForkingMixIn, BaseHTTPServer.HTTPServer):
r"""Handle requests in multi process."""
class ThreadedHTTPServer(SocketServer.ThreadingMixIn, BaseHTTPServer.HTTPServer):
r"""Handle requests in a separate thread."""
SERVER_DICT = {
'thread' : ThreadedHTTPServer,
'process' : ProcessedHTTPServer,
}
SERVER_DFT = 'thread'
def run(sCwd=None, sServer=SERVER_DFT, nPort=PORT_DFT, *lArgs, **dArgs):
r"""
"""
sys.stderr.write('start with %r\n' % sys._getframe().f_locals)
if sCwd is not None:
os.chdir(sCwd)
cServer = SERVER_DICT[sServer]
oHttpd = cServer(("", nPort), SimpleHTTPRequestHandler)
sys.stderr.write('http://%s:%s/\n' % (HOSTNAME, nPort))
oHttpd.serve_forever()
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
# main
def _main():
r"""Main.
"""
import argparse
oParser = argparse.ArgumentParser(
description = __doc__,
formatter_class = argparse.RawTextHelpFormatter,
epilog = 'NOTE: stdin for input, stdout for result, stderr for logging',
)
oParser.add_argument('--pydoc', action='store_true',
help = "show this module's pydoc",
)
oGroupR = oParser.add_argument_group(title='run arguments', description='')
oGroupR.add_argument('--port', action='store', type=int, default=PORT_DFT,
help = 'specify server port (default: %(default)r)',
)
oGroupR.add_argument('--type', action='store', default=SERVER_DFT, choices=SERVER_DICT.keys(),
help = 'specify server type (default: %(default)r)',
)
oGroupR.add_argument('--root', action='store', default=os.getcwd(),
help = 'specify root directory (default: cwd %(default)r)',
)
oGroupR.add_argument('--run', action='store_true',
help = '\n'.join((
'run http server foreground',
)))
oArgs = oParser.parse_args()
if oArgs.pydoc:
help(os.path.splitext(os.path.basename(__file__))[0])
elif oArgs.run:
return run(sCwd=oArgs.root, sServer=oArgs.type, nPort=oArgs.port)
else:
oParser.print_help()
return 1
return 0
if __name__ == "__main__":
exit(_main())
Meanwhile, the single python file with only 200 lines may satisfy your "in Python" and "lightweight" demands.
Last but not least, this ModifiedSimpleHTTPServer.py may be a "killer app" by hand for temporary use, however, Nginx is advised for long term use.

Categories