I'm trying to get the list of files that are fully uploaded on the FTP server.
I have access to this FTP server where a 3rd party writes data and marker files every 15 minutes. Once the data file is completely uploaded then a marker file gets created. we know once this marker file is there that means data files are ready and we can download it. I'm looking for a way to efficiently approach this problem. I want to check every minute if there are any new stable files on FTP server, if there is then I'll download those files. one preferred way is see if the marker file is 2 minutes old then we are good to download marker file and corresponding data file.
I'm new with python and looking for help.
I have some code till I list out the files
import paramiko
from datetime import datetime, timedelta
FTP_HOST = 'host_address'
FTP_PORT = 21
FTP_USERNAME = 'username'
FTP_PASSWORD = 'password'
FTP_ROOT_PATH = 'path_to_dir'
def today():
return datetime.strftime(datetime.now(), '%Y%m%d')
def open_ftp_connection(ftp_host, ftp_port, ftp_username, ftp_password):
"""
Opens ftp connection and returns connection object
"""
client = paramiko.SSHClient()
client.load_system_host_keys()
try:
transport = paramiko.Transport(ftp_host, ftp_port)
except Exception as e:
return 'conn_error'
try:
transport.connect(username=ftp_username, password=ftp_password)
except Exception as identifier:
return 'auth_error'
ftp_connection = paramiko.SFTPClient.from_transport(transport)
return ftp_connection
def show_ftp_files_stat():
ftp_connection = open_ftp_connection(FTP_HOST, int(FTP_PORT), FTP_USERNAME, FTP_PASSWORD)
full_ftp_path = FTP_ROOT_PATH + "/" + today()
file_attr_list = ftp_connection.listdir_attr(full_ftp_path)
print(file_attr_list)
for file_attr in file_attr_list:
print(file_attr.filename, file_attr.st_size, file_attr.st_mtime)
if __name__ == '__main__':
show_ftp_files_stat()
Sample file name
org-reference-delta-quotes.REF.48C2.20200402.92.1.1.txt.gz
Sample corresponding marker file name
org-reference-delta-quotes.REF.48C2.20200402.92.note.txt.gz
I solved my use case with 2 min stable rule, if modified time is within 2 min of the current time, I consider them stable.
import logging
import time
from datetime import datetime, timezone
from ftplib import FTP
FTP_HOST = 'host_address'
FTP_PORT = 21
FTP_USERNAME = 'username'
FTP_PASSWORD = 'password'
FTP_ROOT_PATH = 'path_to_dir'
logger = logging.getLogger()
logger.setLevel(logging.ERROR)
def today():
return datetime.strftime(datetime.now(tz=timezone.utc), '%Y%m%d')
def current_utc_ts():
return datetime.utcnow().timestamp()
def current_utc_ts_minus_120():
return int(datetime.utcnow().timestamp()) - 120
def yyyymmddhhmmss_string_epoch_ts(dt_string):
return time.mktime(time.strptime(dt_string, '%Y%m%d%H%M%S'))
def get_ftp_connection(ftp_host, ftp_username, ftp_password):
try:
ftp = FTP(ftp_host, ftp_username, ftp_password)
except Exception as e:
print(e)
logger.error(e)
return 'conn_error'
return ftp
def get_list_of_files(ftp_connection, date_to_process):
full_ftp_path = FTP_ROOT_PATH + "/" + date_to_process + "/"
ftp_connection.cwd(full_ftp_path)
entries = list(ftp_connection.mlsd())
entry_list = [line for line in entries if line[0].endswith('.gz') | line[0].endswith('.zip')]
ftp_connection.quit()
print('Total file count', len(entry_list))
return entry_list
def parse_file_list_to_dict(entries):
try:
file_dict_list = []
for line in entries:
file_dict = dict({"file_name": line[0],
"server_timestamp": int(yyyymmddhhmmss_string_epoch_ts(line[1]['modify'])),
"server_date": line[0].split(".")[3])
file_dict_list.append(file_dict)
except IndexError as e:
# Output expected IndexErrors.
logging.exception(e)
except Exception as exception:
# Output unexpected Exceptions.
logging.exception(exception, False)
return file_dict_list
def get_stable_files_dict_list(dict_list):
stable_list = list(filter(lambda d: d['server_timestamp'] < current_utc_ts_minus_120(), dict_list))
print('stable file count: {}'.format(len(stable_list)))
return stable_list
if __name__ == '__main__':
ftp_connection = get_ftp_connection(FTP_HOST, FTP_USERNAME, FTP_PASSWORD)
if ftp_connection == 'conn_error':
logger.error('Failed to connect FTP Server!')
else:
file_list = get_list_of_files(ftp_connection, today())
parse_file_list = parse_file_list_to_dict(file_list)
stable_file_list = get_stable_files_dict_list(parse_file_list)
Related
I'm trying to customise the SFTOperator take download multiple file from a server. I know that the original SFTPOperator only allow one file at a time.
I copied the same code from source and I twerk by adding a new function called get_xml_from_source(). Please refer the code below:
def get_xml_from_source(sftp_client, remote_filepath, local_filepath, prev_execution_date, execution_date):
"""
Copy from Source to local path
"""
files_attr = sftp_client.listdir_attr(remote_filepath) # eg: /source/ HITTING ERROR HERE
files_name = sftp_client.listdir(remote_filepath) # eg: /source/
today_midnight = datetime.combine(datetime.today(), time.min)
yesterday_midnight = today_midnight - timedelta(days=1)
for file_attr, file_name in zip(files_attr, files_name):
modified_time = datetime.fromtimestamp(file_attr.st_mtime)
if yesterday_midnight <= modified_time < today_midnight:
# if prev_execution_date <= modified_time < execution_date:
try:
# Download to local path
sftp_client.get(remote_filepath, local_filepath)
print(file_name)
except: # pylint: disable=bare-except
print("File not found")
else:
print("Not the file!")
Where this function will only download files from yesterday up to today.
I added the function at this line:
with self.ssh_hook.get_conn() as ssh_client:
sftp_client = ssh_client.open_sftp()
if self.operation.lower() == SFTPOperation.GET:
local_folder = os.path.dirname(self.local_filepath)
if self.create_intermediate_dirs:
# Create Intermediate Directories if it doesn't exist
try:
os.makedirs(local_folder)
except OSError:
if not os.path.isdir(local_folder):
raise
file_msg = "from {0} to {1}".format(self.remote_filepath,
self.local_filepath)
self.log.info("Starting to transfer %s", file_msg)
# This is where it starts to copy, customization begins here
# sftp_client.get(self.remote_filepath, self.local_filepath) <--- Original code that I commented out and replace with mine below
get_xml_from_source(sftp_client, self.remote_filepath,
self.local_filepath, self.prev_execution_date, self.execution_date)
Note that, rest of the codes did not change. It is how it looks like in the source.
I keep hitting error at files_attr = sftp_client.listdir_attr(remote_filepath) with this error:
Error while transferring from /source/ to
/path/to/destination, error: [Errno 2] No such file.
Which obviously meant, it can't find the sftp directory. I tried running the whole function locally, it works fine.
Is there any part of the code that tied the paramiko connection to only get one file? I checked the paramiko connection for SFTPOperator, it should be just fine. In this case, how should I fix it?
This is how I established my connection when running locally :
def connect_to_source():
"""
Get source credentials
:param: None
:return: username & password
"""
logging.basicConfig(stream=sys.stderr, level=logging.DEBUG)
ssh = paramiko.SSHClient()
ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
username, password = get_eet_credentials()
# key = paramiko.RSAKey.from_private_key_file(openssh_key, password=password)
ssh.connect(hostname=SFTP_SERVER, port=SFTP_PORT_NUMBER,
username=username, password=password)
client = ssh.open_sftp()
print("Connection to source success!")
return client
Lastly, below is my airflow task:
def copy_from_source():
"""
Copy XML file from source to local path
"""
return SFTPOperator(
task_id="copy_from_source",
ssh_conn_id="source_conn",
local_filepath=f"{current_dir}/destination",
remote_filepath= "/source/",
prev_execution_date='{{ prev_execution_date }}',
execution_date='{{ execution_date }}', # strftime("%Y-%m-%d %H:%M:%S")
create_intermediate_dirs=True,
operation="get",
dag=dag
)
I'm trying to do something similar to you. I'm not sure what is causing the issues you are facing but this is the updated SFTP Operator I have written that gets multiple files from a server
sftp_get_multiple_files_operator.py
import os
from pathlib import Path
from typing import Any
from airflow.exceptions import AirflowException
from airflow.models import BaseOperator
from airflow.contrib.hooks import SSHHook
class SFTPGetMultipleFilesOperator(BaseOperator):
template_fields = ('local_directory', 'remote_filename_pattern', 'remote_host')
def __init__(
self,
*,
ssh_hook=None,
ssh_conn_id=None,
remote_host=None,
local_directory=None,
remote_filename_pattern=None,
filetype=None,
confirm=True,
create_intermediate_dirs=False,
**kwargs,
) -> None:
super().__init__(**kwargs)
self.ssh_hook = ssh_hook
self.ssh_conn_id = ssh_conn_id
self.remote_host = remote_host
self.local_directory = local_directory
self.filetype = filetype
self.remote_filename_pattern = remote_filename_pattern
self.confirm = confirm
self.create_intermediate_dirs = create_intermediate_dirs
def execute(self, context: Any) -> str:
file_msg = None
try:
if self.ssh_conn_id:
if self.ssh_hook and isinstance(self.ssh_hook, SSHHook):
self.log.info("ssh_conn_id is ignored when ssh_hook is provided.")
else:
self.log.info(
"ssh_hook is not provided or invalid. Trying ssh_conn_id to create SSHHook."
)
self.ssh_hook = SSHHook(ssh_conn_id=self.ssh_conn_id)
if not self.ssh_hook:
raise AirflowException("Cannot operate without ssh_hook or ssh_conn_id.")
if self.remote_host is not None:
self.log.info(
"remote_host is provided explicitly. "
"It will replace the remote_host which was defined "
"in ssh_hook or predefined in connection of ssh_conn_id."
)
self.ssh_hook.remote_host = self.remote_host
with self.ssh_hook.get_conn() as ssh_client:
sftp_client = ssh_client.open_sftp()
all_files = sftp_client.listdir()
self.log.info(f'Found {len(all_files)} files on server')
timestamp = context['ds_nodash']
filename_pattern = self.remote_filename_pattern + timestamp
# fetch all CSV files for the run date that match the filename pattern
matching_files = [f for f in all_files
if f.find(filename_pattern) != -1]
# if file type is specified filter matching files for the file type
if self.filetype is not None:
matching_files = [filename for filename in matching_files
if filename[-len(self.filetype):] == self.filetype]
self.log.info(f'Found {len(matching_files)} files with name including {filename_pattern}')
local_folder = os.path.dirname(self.local_directory)
if self.create_intermediate_dirs:
Path(local_folder).mkdir(parents=True, exist_ok=True)
for f in matching_files:
self.log.info(f"Starting to transfer from /{f} to {self.local_directory}/{f}")
sftp_client.get(f'/{f}', f'{self.local_directory}/{f}')
except Exception as e:
raise AirflowException(f"Error while transferring {file_msg}, error: {str(e)}")
return self.local_directory
def _make_intermediate_dirs(sftp_client, remote_directory) -> None:
"""
Create all the intermediate directories in a remote host
:param sftp_client: A Paramiko SFTP client.
:param remote_directory: Absolute Path of the directory containing the file
:return:
"""
if remote_directory == '/':
sftp_client.chdir('/')
return
if remote_directory == '':
return
try:
sftp_client.chdir(remote_directory)
except OSError:
dirname, basename = os.path.split(remote_directory.rstrip('/'))
_make_intermediate_dirs(sftp_client, dirname)
sftp_client.mkdir(basename)
sftp_client.chdir(basename)
return
dag.py
sftp_report = SFTPGetMultipleFilesOperator(
task_id=f"sftp_reports_to_gcs",
ssh_conn_id="sftp_connection",
local_directory=f'/opt/airflow/dags/reports',
remote_filename_pattern=f'reportname_', # ds_nodash is added in the operator by accessing Airflow context
create_intermediate_dirs=True,
filetype='.csv'
)
I am new to python and trying to understanding how to automate stuff. I have a folder in which 5 csv files get updated daily, however sometimes one of them or two dont on particular days. Im having to manually check this folder. Instead I want to automate this in such a way that if a csv file does not update in the last 24hours, It can send an email to myself alerting me of this.
My code:
import datetime
import glob
import os
import smtplib
import string
now = datetime.datetime.today() #Get current date
list_of_files = glob.glob('c:/Python/*.csv') # * means all if need specific format then *.csv
latest_file = max(list_of_files, key=os.path.getctime) #get latest file created in folder
newestFileCreationDate = datetime.datetime.utcfromtimestamp(os.path.getctime(latest_file)) # get creation datetime of last file
dif = (now - newestFileCreationDate) #calculating days between actual date and last creation date
logFile = "c:/Python/log.log" #defining a log file
def checkFolder(dif, now, logFile):
if dif > datetime.timedelta(days = 1): #Check if difference between today and last created file is greater than 1 days
HOST = "12.55.13.12" #This must be your smtp server ip
SUBJECT = "Alert! At least 1 day wthout a new file in folder xxxxxxx"
TO = "xx.t#gmail.com"
FROM = "xx.t#gmail.com"
text = "%s - The oldest file in folder it's %s old " %(now, dif)
BODY = string.join((
"From: %s" % FROM,
"To: %s" % TO,
"Subject: %s" % SUBJECT ,
"",
text
), "\r\n")
server = smtplib.SMTP(HOST)
server.sendmail(FROM, [TO], BODY)
server.quit()
file = open(logFile,"a") #Open log file in append mode
file.write("%s - [WARNING] The oldest file in folder it's %s old \n" %(now, dif)) #Write a log
file.close()
else : # If difference between today and last creation file is less than 1 days
file = open(logFile,"a") #Open log file in append mode
file.write("%s - [OK] The oldest file in folder it's %s old \n" %(now, dif)) #write a log
file.close()
checkFolder(dif,now,logFile) #Call function and pass 3 arguments defined before
However, this does not run without error and I just want to be notified by mail of those files in the folder that havent been updated. even if it is one of out 5 files of them or 5 out of 5 that havent updated.
Use pure python and concise way
import hashlib
import glob
import json
import smtplib
from email.message import EmailMessage
import time
import schedule #pip install schedule
hasher = hashlib.md5()
size = 65536 #to read large files in chunks
list_of_files = glob.glob('./*.csv') #absolute path for crontab
Part 1) Run this script first then comment it out. It will create a json file with hashes of your files.
first_hashes = {}
for x in list_of_files:
with open(x, 'rb') as f:
buf = f.read(size)
while len(buf) > 0:
hasher.update(buf)
buf = f.read(size)
first_hashes[x] = hasher.hexdigest()
with open('hash.json', 'w') as file:
file.write(json.dumps(first_hashes, indent=2))
Now comment it out or even delete it.
Part 2) Automation script:
def send_email():
check_hash = {} #Contain hashes that have not changed
with open('hash.json') as f: #absolute path for crontab
data = json.load(f)
for x in list_of_files:
with open(x, 'rb') as f:
buf = f.read(size)
while len(buf) > 0:
hasher.update(buf)
buf = f.read(size)
new_hash = hasher.hexdigest()
#if a hash match with one in data, that file has not changed
if new_hash in data.values():
check_hash[x] = new_hash
data[x] = new_hash
#update our hashes
with open('hash.json', 'w') as file: #absolute path for crontab
file.write(json.dumps(data, indent=2))
if len(check_hash) > 0: #check if there's anything in check_hash
filename="check_hash.txt" #absolute path for crontab
#write to a text file named "check_hash.txt"
with open(filename, 'w') as f: #absolute path for crontab
f.write(json.dumps(check_hash, indent=2))
# for gmail smtp setup watch youtu.be/JRCJ6RtE3xU
EMAIL_ADDRESS = 'SMTPAddress#gmail.com'
EMAIL_PASSWORD = 'SMTPPassWord'
msg = EmailMessage()
msg['Subject'] = 'Unupdated files'
msg['From'] = EMAIL_ADDRESS
msg['To'] = 'receive#gmail.com'
msg.set_content('These file(s) did not update:')
msg.add_attachment(open(filename, "r").read(), filename=filename)
with smtplib.SMTP_SSL('smtp.gmail.com', 465) as smtp:
smtp.login(EMAIL_ADDRESS, EMAIL_PASSWORD)
smtp.send_message(msg)
#for faster testing check other options here github.com/dbader/schedule
schedule.every().day.at("10:30").do(send_email)
while 1:
schedule.run_pending()
time.sleep(1)
EDIT: If you restart your pc, you will need to run this file again to restart schedule, to avoid that, you can use crontab as follows (learn how from youtu.be/j-KgGVbyU08):
# mm hh DOM MON DOW command
30 10 * * * python3 path-to-file/email-script.py #Linux
30 10 * * * python path-to-file/email-script.py #Windows
This will run the script everyday at 10:30 AM IF the pc is ON at that time. For faster testing (run every 1 minute) use:
* * * * * python3 path-to-file/email-script.py
NOTE: If you gonna use crontab, you MUST use absolute path for all file references and replace
schedule.every().day.at("10:30").do(send_email)
while 1:
schedule.run_pending()
time.sleep(1)
with
if __name__ == "__main__":
send_email()
Tested and it's working great!
Granted I don't know CSV but I would import time and using the format and time. Sleep function create a timer. What's good about time module is that you can configure it to set a value to a variable after time is up. SO maybe if you do that and put into an if statement, when the variable reaches a value, send the email.
Are you thinking of something like this?
import os
from datetime import datetime
import smtplib
import textwrap
def send_email_failure():
SERVER = "12.55.13.12" #This must be your smtp server ip
SUBJECT = "Alert! At least 1 day without a new file in folder xxxxxxx"
TO = "xx.t#gmail.com"
FROM = "xx.t#gmail.com"
TEXT = "%s - The oldest file in folder it's %sh old " %(datetime.now(), oldest_time_hour)
"""this is some test documentation in the function"""
message = textwrap.dedent("""\
From: %s
To: %s
Subject: %s
%s
""" % (FROM, ", ".join(TO), SUBJECT, TEXT))
print(message)
# Send the mail
server = smtplib.SMTP(SERVER)
server.sendmail(FROM, TO, message)
server.quit()
def save_log(logFile, ok_or_failure, time_now, delta):
file = open(logFile,"a") #Open log file in append mode
if ok_or_failure != 'ok':
file.write("%s - [WARNING] The oldest file in folder it's %s old \n" %(time_now, delta))
else:
file.write("%s - [OK] The oldest file in folder it's %s old \n" %(time_now, delta))
file.close()
def check_file(filename):
print(filename)
if filename.endswith('.csv'):
print('csv')
try:
mtime = os.path.getmtime(filename) # get modified time
except OSError:
mtime = 0
last_modified_date = datetime.fromtimestamp(mtime)
tdelta = datetime.now() - last_modified_date
hours = tdelta.seconds // 3600 # convert to hours
return hours
else:
return 0
# we check what files are in the dir 'files'
# and their modification time
oldest_time_hour = 0
for path, dirs, files in os.walk('./files'): # this need to be modified by case
for file in files:
# get each file time of modification
time = check_file(path+'/'+file)
if time > 0:
# save the oldest time
if time > oldest_time_hour:
oldest_time_hour = time
# if it is older that 24h
if oldest_time_hour > 24:
save_log('log.log', 'failure', datetime.now(), oldest_time_hour)
send_email_failure()
else:
save_log('log.log', 'ok', datetime.now(), oldest_time_hour)
also you will need an end-less loop to run the python script or a chronjob to run this python script every hour or so
Why are you checking the last_modified_date? I suggest you to check the modification of the file with md5 checksum.
My Idea is, if you have following files :
file1.csv
file2.csv
file3.csv
file4.csv
file5.csv
You can check their md5 checksum and write the result + DateTime into a file next to the original file. like following :
file1.csv
file1.csv_checksum
Content of file1.csv_checksum
timestamp,checksum
1612820511,d41d8cd98f00b204e9800998ecf8427e
you can check md5 of a file with following code:
>>> import hashlib
>>> hashlib.md5(open('filename.exe','rb').read()).hexdigest()
then you can check the result with the provided one in the checksum file ( and if the checksum file does not exist, just create it for the first time )
I think you can easily handle it with this approach.
At first i started with a task scheduler decorator which will enable you to poll a directory for a fixed delay:
import time
import functools
def scheduled(fixed_delay):
def decorator_scheduled(func):
functools.wraps(func)
def wrapper_schedule(*args, **kwargs):
result = func(*args, **kwargs)
self = args[0]
delay = getattr(self, fixed_delay)
time.sleep(delay)
return result
return wrapper_schedule
return decorator_scheduled
Saved it as a seperate module named task_scheduler.py.
I will use it in my file watcher:
import os
from task_scheduler import scheduled
import smtplib, ssl
class FileWatcher:
def __init__(self,
files_path='./myFiles',
extension='.csv',
poll_delay=2):
self.files_path = files_path
self.extension = extension
self.poll_delay = poll_delay
def notify_host_on_nonchange(self, file_path):
port = 465
smtp_server = "smtp.gmail.com"
sender_email = "sender#gmail.com"
receiver_email = "receiver#gmail.com"
password = "Your password here" #You may want to read it from file
message = f"No change in file: {file_path} for 24 hurs!"
context = ssl.create_default_context()
with smtplib.SMTP_SSL(smtp_server, port, context=context) as server:
server.login(sender_email, password)
server.sendmail(sender_email, receiver_email, message)
def watch(self):
try:
while True:
self.poll_()
except KeyboardInterrupt:
log.debug('Polling interrupted by user.')
#scheduled("poll_delay")
def poll_(self,):
for f in os.listdir(self.files_path):
full_path = os.path.join(self.files_path, f)
path_stat = os.stat(full_path)
_, file_ext = os.path.splitext(f)
ctime = path_stat.st_ctime
diff = time.time() - ctime/3600
if diff<=24 or not S_ISREG(path_stat.st_mode) or str(file_ext) != self.extension:
continue
self.notify_host_on_nonchange(full_path)
if __name__ == "__main__":
file_listener = FileWatcher()
file_listener.watch()
Above class defines a poll_ function which benefits from os.stat module to check the modification time. If modification time smaller than or equal to 24 or the file is not a regular file (means that it is a directory) or it does not have the extension you look for polling will skip it, otherwise calls the notify function to send e-mail. It uses the gmail smtp server example but you can change it as appropriate for your environment. Watch function is a wrapper for continous polling.
This class is adapted from my machine learning model watcher and loader, you can access that version and project from my github. For further explanation about decorator and script you can check out my medium post.
I'm trying to make my own logging system because I don't like Python's one, but I ran into a small problem. Whenever I log two or more things, only the last log is written. Example - I log 'code initialized' at the start of the program. Then, I log 'exiting'. The result in the log file is only 'exiting' How do I fix this? Here is my code.
File 1.
from functions import *
from variables import *
try:
open('key.txt', "r")
key = True
except FileNotFoundError:
key = False
if check_internet():
x = status_boot[0]
elif not check_internet():
x = status_boot[1]
log("archer initialized")
if key:
print("archer status: " + x)
elif not key:
print(status_boot[4])
File 2.
from datetime import datetime
from variables import *
import requests
def time():
now = datetime.now()
return now.strftime("%H:%M:%S")
def check_internet():
timeout = 5
try:
_ = requests.get(url, timeout=timeout)
return True
except requests.ConnectionError:
return False
def log(m):
write = open('archerlog.txt', 'w+')
return write.write(f'[{time()}] archer: {m}\n')
File 3.
url = "http://www.google.com/"
status_boot = ["online", "offline", "key invalid", "error", "Error: Key not accessible!"]
x = status_boot[3]
I Have this Code... i want to see this directory on my ftp server. In that dir i have 3 files but my code reads only 2. And it throws an error which i handle. Take a look on the code and the output
import datetime
import ftplib
import os
errors = 0
default_ftp_name = 'username'
default_ftp_pass = 'pass'
host = 'ftp.example.com'
print('Connecting To ' + host)
try:
ftp = ftplib.FTP(host)
print('Successfully connected! ')
except:
print('[!]Failed To Connect')
errors += 1
print('Logging with: ' + default_ftp_name)
try:
ftp.login(default_ftp_name,default_ftp_pass)
print('Login Success')
except:
print('[!]Couldnt log in')
errors += 1
print('Changing Directory to /Public/ConnectedUsers/')
try:
ftp.cwd('/Public/ConnectedUsers/')
except:
print('[!]Directory failed to change')
errors += 1
try:
print('Retrieving Files...')
files = ftp.dir()
print(files)
except:
print('[!]Didnt Get The Files')
errors += 1
print('[t] Total Errors: ' + str(errors))
connection = False
if connection is True:
#Dosomehting
var = 10
else:
print('Connection Error')
See the output right here.
it shows 2 items but i have 3.
What do i need to change in order to access all files?
Take a look here
http://i.stack.imgur.com/TQDyv.png
ftp.dir prints the output to stdout, so files is currently None. You want to replace that call with ftp.nlst (or ftp.retrlines with a callback handler). See the docs.
The Problem, I made a rest api out of bottle to start Elasticsearch bulk loads. The bulk load process runs inside of a multiprocess pool, the problem is while that is running the api wont except any other commands.
I've even tried running another instance of the api on a different port but I have stack question about that too. The second one doesn't accept commands.
I want to be able to call a different api command to get the status of the load and return it. Currently it just includes ES data, but eventually its going to include each nodes stats. This is designed to run from Jenkins and initiate parallel loads.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Copyright [current year] the Melange authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from elasticsearch import Elasticsearch
from elasticsearch.exceptions import RequestError
from subprocess import Popen, PIPE
from multiprocessing import Pool, Process, pool
from datetime import datetime
import boto3
import sys
import os
import argparse
import logging
import logging.config
from bottle import route, run
from boto.cloudformation.stack import Output
import json
#this is what is called to set up the loading process from the api.
def start_load(secret, access, protocol, host, ports, index, type, mapping, data,threads):
# decompress a gzip string
def decompress_gzip(data):
return Popen(['zcat'], stdout=PIPE, stdin=PIPE).communicate(input=data)[0]
# parse an s3 path into a bucket and key 's3://my-bucket/path/to/data' -> ('my-bucket', 'path/to/data')
def parse_s3_path(str):
_, _, bucket, key = str.split('/', 3)
return (bucket, key)
def shell_command_execute(command):
p = Popen(command, stdout=PIPE, shell=True)
(output, err) = p.communicate()
return output
# load an S3 file to elasticsearch
def load_s3_file(s3_bucket, s3_key, es_host, es_port, es_index, es_type, access, secret):
try:
logging.info('loading s3://%s/%s', s3_bucket, s3_key)
s3 = boto3.client('s3', aws_access_key_id=access, aws_secret_access_key=secret)
file_handle = s3.get_object(Bucket=s3_bucket, Key=s3_key)
file_contents = file_handle['Body'].read()
logging.info('%s'%s3_key)
if file_contents:
if s3_key.endswith('.gz'):
file_contents = decompress_gzip(file_contents)
es = Elasticsearch(host=es_host, port=es_port, timeout=180)
es.bulk(body=file_contents, index=es_index, doc_type=es_type, timeout=120)
except Exception as e:
logging.error("There has been a major error %s" % e)
# load an S3 file to elasticsearch
def load_single_s3_file(s3_bucket, s3_key, es_host, es_port, es_index, es_type, access, secret):
try:
logging.info('loading s3://%s/%s', s3_bucket, s3_key)
s3 = boto3.client('s3', aws_access_key_id=access, aws_secret_access_key=secret)
file_handle = s3.get_object(Bucket=s3_bucket, Key=s3_key)
file_contents = file_handle['Body'].read()
logging.info('%s'%s3_key)
if file_contents:
if s3_key.endswith('.gz'):
file_contents = decompress_gzip(file_contents)
es = Elasticsearch(host=es_host, port=es_port, timeout=180)
res = es.get(index="test-index", doc_type='tweet', id=1)
es.insert(body = file_contents, index = es_index, doc_type=es_type, timeout=120)
except Exception as e:
logging.error("There has been a major error %s" % e)
start = datetime.now()
es_url = protocol + '://' + host + ':' + str(ports) + '/' + index + '/' + type
es = Elasticsearch(host=host, port=ports, timeout=180)
# S3 file - https://boto3.readthedocs.org/en/latest/reference/services/s3.html#object
s3 = boto3.client('s3', aws_access_key_id=access, aws_secret_access_key=secret)
s3_bucket, s3_key = parse_s3_path(mapping)
file_handle = s3.get_object(Bucket=s3_bucket, Key=s3_key)
mapping = file_handle['Body'].read()
try:
es.indices.create(index=index, body=mapping)
except:
logging.error('index exist')
logging.info('starting to load %s to %s', data, es_url)
es.indices.put_settings({'index': {'refresh_interval': '-1'}}, index=index)
pool = Pool(processes=int(threads))
s3 = boto3.resource('s3', aws_access_key_id=access, aws_secret_access_key=secret)
s3_bucket, s3_key = parse_s3_path(data)
for file_summary in s3.Bucket(s3_bucket).objects.all():
if file_summary.key.startswith(s3_key):
pool.apply_async(load_s3_file, args=(s3_bucket, file_summary.key, host, ports, index, type, access, secret))
pool.close()
pool.join()
es.indices.put_settings({'index': {'refresh_interval': '1s'}}, index=index)
logging.info('finished loading %s to %s in %s', data, es_url, str(datetime.now() - start))
sys.exit(0)
#reset_es_settings(host, ports)
#This is what is called when no arguments are given
#route('/load_data/')
def no_comands():
return """Please include all nessecary values: example:
Start Load
http://127.0.0.1:8001/load_data/load&host=ip or DNS&thread=5&mappinglocation=tr-ips-ses-data|mappings|version_1_2|wos.mapping&datalocation=tr-ips-ses-data|json-data|wos|20150724|wos-1&port=9200&index=wos4&protocol=http&type=wos&access=access_key&secret=secret_key
Delete Index
http://127.0.0.1:8001/delete/wos4&host=ip or DNS&port=9200
with loading you must specify the load command as shown above
use & to seperate values
use = to seperate key value pairs
use | to insert \
"""
#route('/load_data/<name>', method='GET')
def commands( name="Execute Load" ):
values = name.split('&')
#split apart the url syntax items are split by & key values by = and any plcae that needs \ gets |
try:
command = values[0]
host = values[1] + ".us-west-2.elb.amazonaws.com"
threads = values[2]
mapping_location = values[3].replace('|', '/')
data_location = values[4].replace('|', '/')
#mapping_location = values[3]
#data_location = values[4]
ports = values[5]
index = values[6]
protocol = values[7]
type = values[8]
access = values[9]
secret = values[10]
host = host.split('=')[1]
threads = threads.split('=')[1]
mapping_location = "s3://" + mapping_location.split('=')[1]
data_location = "s3://" + data_location.split('=')[1]
ports = ports.split('=')[1]
index = index.split('=')[1]
protocol = protocol.split('=')[1]
types = type.split('=')[1]
access = access.split('=')[1]
secret = secret.split('=')[1]
yield ("Starting Load of data use /get_status/es_url&es_port&index to get the status of your load.")
start_load(secret, access, protocol, host, ports, index, types, mapping_location, data_location,threads)
except Exception as e:
logging.error(e)
yield """Please include all nessecary values: example:
Start Load
http://127.0.0.1:8001/load_data/load&host=ip or DNS&thread=5&mappinglocation=tr-ips-ses-data|mappings|version_1_2|wos.mapping&datalocation=tr-ips-ses-data|json-data|wos|20150724|wos-1&port=9200&index=wos4&protocol=http&type=wos&access=access_key&secret=secret_key
Delete Index
http://127.0.0.1:8001/delete/wos4&host=ip or DNS&port=9200
with loading you must specify the load command as shown above
use & to seperate values
use = to seperate key value pairs
use | to insert \
"""
#This is what is cvalled when /delete/ is used.
#route('/delete/<name>', method='GET' )
def recipe_delete( name="Delete Index" ):
def shell_command_execute(command):
p = Popen(command, stdout=PIPE, shell=True)
(output, err) = p.communicate()
return output
values = name.split('&')
try:
#split apart the url syntax items are split by & key values by |
index = values[0]
host = values[1] + ".us-west-2.elb.amazonaws.com"
host = host.split('=')[1]
port = values[2]
port = port.split('=')[1]
except Exception as e:
logging.error(e)
return """Please include all nessecary values: example:
Start Load
http://127.0.0.1:8001/load_data/load&host=ip or DNS&thread=5&mappinglocation=tr-ips-ses-data|mappings|version_1_2|wos.mapping&datalocation=tr-ips-ses-data|json-data|wos|20150724|wos-1&port=9200&index=wos4&protocol=http&type=wos&access=access_key&secret=secret_key
Delete Index
http://127.0.0.1:8001/delete/wos4&host=ip or DNS&port=9200
with loading you must specify the load command as shown above
use & to seperate values
use = to seperate key value pairs
use | to insert \
"""
try:
#This is the command that deletes the index.
curl_command = 'curl -XDELETE http://' + host + ':9200/' + index
shell_command_execute(curl_command)
return "Successfully Deleted Index"
except Exception as e:
logging.error(e)
return "Failed to Deleted Index %s" % e
if __name__ == '__main__':
reload(sys)
sys.setdefaultencoding('utf8')
url = os.path.dirname(os.path.realpath(__file__)) + '/logging.ini'
print url
logging.config.fileConfig(url)
run(host='172.31.28.189', port=8001, debug=True)
#run(host='127.0.0.1', port=8001, debug=True)