I've got a small script that monitors when files are added to or removed from a directory. The next step is for the script to execute the files (Windows batch files) once they've been added to the directory. I'm struggling to understand how to use a variable with subprocess.call (if that is even the best way this can be achieved). Could anyone help me please? Many thanks. Code looks like this so far:
import sys
import time
import os

inputdir = 'c:\\test\\'
os.chdir(inputdir)
contents = os.listdir(inputdir)
count = len(contents)
dirmtime = os.stat(inputdir).st_mtime

while True:
    newmtime = os.stat(inputdir).st_mtime
    if newmtime != dirmtime:
        dirmtime = newmtime
        newcontents = os.listdir(inputdir)
        added = set(newcontents).difference(contents)
        if added:
            print "These files added: %s" % (" ".join(added))
            import subprocess
            # This is where I'm stuck -- how do I pass each added file to the call?
            subprocess.call(%, shell=True)
        removed = set(contents).difference(newcontents)
        if removed:
            print "These files removed: %s" % (" ".join(removed))
        contents = newcontents
    time.sleep(15)
This should do what you wanted; I cleaned it up a little.
import sys
import time
import os
import subprocess

def monitor_execute(directory):
    dir_contents = os.listdir(directory)
    last_modified = os.stat(directory).st_mtime

    while True:
        time.sleep(15)
        modified = os.stat(directory).st_mtime
        if last_modified == modified:
            continue
        last_modified = modified

        current_contents = os.listdir(directory)
        new_files = set(current_contents).difference(dir_contents)
        if new_files:
            print 'Found new files: %s' % ' '.join(new_files)
            for new_file in new_files:
                subprocess.call(new_file, shell=True)

        lost_files = set(dir_contents).difference(current_contents)
        if lost_files:
            print 'Lost these files: %s' % ' '.join(lost_files)

        dir_contents = current_contents
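For completeness, a top-level call such as the following (the directory path is just the one from the question, used here as an example) starts the monitor:

if __name__ == '__main__':
    monitor_execute('c:\\test\\')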
My code was working just fine before adding the hash function. I was getting the list of all folders and files in my directory in the PrettyTable. Once I added the hash function, I got maybe 5 of the files in that directory with hashes in the table. I am not sure where I have gone wrong. Please forgive me, I am new to this. We are not learning to code from scratch, but have to modify existing code to function the way we need it to.
# Python Standard Libraries
import os        # file system methods
import hashlib   # hashing functions
import sys       # system methods
import time      # time conversions

# Python 3rd Party Libraries
from prettytable import PrettyTable  # pip install prettytable

# Local Functions
def GetFileMetaData(fileName):
    # obtain file system metadata
    try:
        metaData = os.stat(fileName)       # Use the stat method to obtain meta data
        fileSize = metaData.st_size        # Extract fileSize and MAC Times
        timeLastAccess = metaData.st_atime
        timeLastModified = metaData.st_mtime
        timeCreated = metaData.st_ctime
        macTimeList = [timeLastModified, timeCreated, timeLastAccess]  # Group the MAC Times in a list
        return True, None, fileSize, macTimeList
    except Exception as err:
        return False, str(err), None, None

# Pseudo Constants

# Start of the Script
tbl = PrettyTable(['FilePath', 'FileSize', 'UTC-Modified', 'UTC-Accessed', 'UTC-Created', 'SHA-256 HASH'])

# file check
while True:
    targetFolder = input("Enter Target Folder: ")
    if os.path.isdir(targetFolder):
        break
    else:
        print("\nInvalid Folder ... Please Try Again")

print("Walking: ", targetFolder, "\n")
print()

for currentRoot, dirList, fileList in os.walk(targetFolder):
    for nextFile in fileList:
        fullPath = os.path.join(currentRoot, nextFile)
        absPath = os.path.abspath(fullPath)
        fileSize = os.path.getsize(absPath)
        success, errInfo, fileSize, macList = GetFileMetaData(absPath)
        if success:
            # convert to readable Greenwich (UTC) time
            modTime = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime(macList[0]))
            accTime = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime(macList[1]))
            creTime = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime(macList[2]))

            # hashing function
            with open(absPath, 'rb') as target:
                fileContents = target.read()
            sha256Obj = hashlib.sha256()
            sha256Obj.update(fileContents)
            hexDigest = sha256Obj.hexdigest()

            tbl.add_row([absPath, fileSize, modTime, accTime, creTime, hexDigest])

tbl.align = "l"  # align the columns left justified

# display the table
print(tbl.get_string(sortby="FileSize", reversesort=True))
print("\nScript-End\n")
I have created the script below to remove unwanted genes from a database. It works quite well if you only run it once, but when I re-run it, it gives me an error message such as:
shutil.Error: Destination path 'path/rejected_database_genes/gene_A.fa' already exists
And obviously it stops. I think the problem arises when there are two or more genes in a file that need to be removed.
Kind regards, and thanks for your help.
import glob, sys, os, shutil
import argparse

from Bio import SeqIO, SearchIO
from Bio.SeqRecord import SeqRecord

def help_function():
    print 'Hi'

parser = argparse.ArgumentParser()
parser.add_argument('-input_file', '-i', type=str, help='path_to_data')
opts = parser.parse_args()

def check_file_exists(filepath, file_description):
    if not os.path.exists(filepath):
        print("The " + file_description + " (" + filepath + ") does not exist")
        sys.exit(1)
    else:
        print file_description + " detected"

def remove_empty_files(alleles_files, destination):
    input_handle = open(alleles_files, 'r')
    gene_records = list(SeqIO.parse(input_handle, 'fasta'))
    for gene_record in gene_records:
        #filename = gene_record.id[0]
        #count = 0
        if len(gene_record.seq) < 5 or 'N' in gene_record.seq:
            print gene_record.id
        elif '-' in gene_record.seq:
            print gene_record.id
        #count += 1
        shutil.move(alleles_files, destination)

def main():
    destination = opts.input_file + '/rejected_database_genes'
    if os.path.exists(destination):
        print 'Folder already exists'
    else:
        os.makedirs(destination)
        print 'Folder has been created'
    files = glob.glob(opts.input_file + '/*.fa')
    #print files
    #sys.exit()
    for f in files:
        #print f
        #sys.exit()
        alleles_files = glob.glob(f)[0]
        #print alleles_files
        #sys.exit()
        remove_empty_files(alleles_files, destination)
    print 'Files have been removed'

main()
The problem you have is in the shutil.move line: if you specify the full path to both the source and the destination, the existing file will be overwritten and you will not get this error. If you don't want to overwrite and need both files, just rename the destination file to something else.
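For example (a sketch of that suggestion; it assumes the destination file should simply keep the source file's name, and the paths are placeholders):

import os
import shutil

alleles_files = 'path/gene_A.fa'                   # example source file
destination = 'path/rejected_database_genes'       # example destination folder

# Moving to an explicit destination *file* path overwrites an existing copy
# instead of raising "Destination path ... already exists".
shutil.move(alleles_files, os.path.join(destination, os.path.basename(alleles_files)))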
What I wanted was that the moment the script found a file with an unwanted sequence, it would move that file somewhere else so I could check it later. The problem I was having was that if the same file contained two unwanted sequences, it would throw an error telling me that the file already existed at the destination, and stop. I managed to solve this by adding an if statement. The corrected script is the one below:
import glob, sys, os, shutil
import argparse

from Bio import SeqIO, SearchIO
from Bio.SeqRecord import SeqRecord

def help_function():
    print 'Hi'

parser = argparse.ArgumentParser()
parser.add_argument('-input_file', '-i', type=str, help='path_to_data')
opts = parser.parse_args()

def check_file_exists(filepath, file_description):
    if not os.path.exists(filepath):
        print("The " + file_description + " (" + filepath + ") does not exist")
        sys.exit(1)
    else:
        print file_description + " detected"

def remove_empty_files(alleles_files, destination):
    input_handle = open(alleles_files, 'r')
    gene_records = list(SeqIO.parse(input_handle, 'fasta'))
    geneID_list = []
    for gene_record in gene_records:
        filename = gene_record.id.split('_')
        geneID = filename[0] + '_' + filename[1]
        if len(gene_record.seq) < 5 or 'N' in gene_record.seq:
            geneID_list.append(geneID)
            shutil.move(alleles_files, destination)
            print geneID_list
            #break
        if '-' in gene_record.seq:
            geneID_list.append(geneID)
            shutil.move(alleles_files, destination)
            print geneID_list
            #break
        if len(geneID_list) > 0:
            break

def main():
    if len(sys.argv) <= 1:
        parser.print_help()
        sys.exit()
    else:
        check_file_exists(opts.input_file, 'input_file')
    destination = opts.input_file + '/rejected_database_genes'
    if os.path.exists(destination):
        print 'Folder already exists'
    else:
        os.makedirs(destination)
        print 'Folder has been created'
    files = glob.glob(opts.input_file + '/*.fa')
    #print files
    #sys.exit()
    for f in files:
        #print f
        #sys.exit()
        alleles_files = glob.glob(f)[0]
        #print alleles_files
        #sys.exit()
        remove_empty_files(alleles_files, destination)
    print 'Files have been removed'

main()
By adding the third "if" statement, as soon as an unwanted sequence is found the file is moved to the destination and the loop breaks, so the script moves on to check the next file.
So, I wrote this to monitor a folder for new pictures and print any that are found. It works, but I am assuming there is a more robust/efficient way to tackle this problem, as I want it to run for 5-6 hours at a time.
My main problem is that I don't like using open-ended while loops like this...
Would anyone tackle this differently? If so, would anyone be willing to explain?
import os
import glob
import win32com.client
import time
from pywinauto.findwindows import find_window
from pywinauto.win32functions import SetForegroundWindow

printed = []
i = 10
while i < 1000000000000000:
    files = glob.glob("C://Users//pictures/*.jpg")
    for filename in files:
        print filename
        try:
            if printed.index(str(filename)) >= 0:
                print printed.index(filename)
                print "Image found"
        except ValueError:
            printed.append(filename)
            os.startfile(filename, "print")
            shell = win32com.client.Dispatch("WScript.Shell")
            time.sleep(2)
            SetForegroundWindow(find_window(title='Print Pictures'))
            shell.AppActivate("Print Pictures")
            shell.SendKeys("{ENTER}")
    i = i + 1
    time.sleep(5)
The link below is a related post. Instead of using a long while loop, you can use a watcher to trigger your operation.
How to detect new or modified files
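As an illustration of the watcher idea, here is a minimal sketch using the third-party watchdog package (an assumption on my part; the follow-up below uses the win32 change-notification API instead):

import time
from watchdog.observers import Observer
from watchdog.events import FileSystemEventHandler

class NewImageHandler(FileSystemEventHandler):
    def on_created(self, event):
        # Called once for each newly created file or directory.
        if not event.is_directory and event.src_path.lower().endswith('.jpg'):
            print 'New picture:', event.src_path  # print or queue it here

observer = Observer()
observer.schedule(NewImageHandler(), 'C://Users//pictures', recursive=False)
observer.start()
try:
    while True:
        time.sleep(1)
except KeyboardInterrupt:
    observer.stop()
observer.join()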
Big thanks to scope for his comment. I have added my printing lines to the example and it works well. Code posted below for anyone who wants it; the commented version of the code is in the linked post. Now to tidy up a few other things...
import os
import win32file
import win32event
import win32con
import glob
import win32com.client
import time
from pywinauto.findwindows import find_window
from pywinauto.win32functions import SetForegroundWindow

def print_photo(filename):
    print filename
    filename = path_to_watch + "\\" + filename[0]
    os.startfile(filename, "print")
    shell = win32com.client.Dispatch("WScript.Shell")
    time.sleep(2)
    SetForegroundWindow(find_window(title='Print Pictures'))
    shell.AppActivate("Print Pictures")
    shell.SendKeys("{ENTER}")

path_to_watch = os.path.abspath("C:\\Users\\Ciaran\\Desktop\\")

change_handle = win32file.FindFirstChangeNotification(
    path_to_watch,
    0,
    win32con.FILE_NOTIFY_CHANGE_FILE_NAME
)

try:
    old_path_contents = dict([(f, None) for f in os.listdir(path_to_watch)])
    while 1:
        result = win32event.WaitForSingleObject(change_handle, 500)
        if result == win32con.WAIT_OBJECT_0:
            new_path_contents = dict([(f, None) for f in os.listdir(path_to_watch)])
            added = [f for f in new_path_contents if not f in old_path_contents]
            print_photo(added)
            deleted = [f for f in old_path_contents if not f in new_path_contents]
            if added: print "Added: ", ", ".join(added)
            if deleted: print "Deleted: ", ", ".join(deleted)
            old_path_contents = new_path_contents
            win32file.FindNextChangeNotification(change_handle)
finally:
    win32file.FindCloseChangeNotification(change_handle)
I am reading the first line of every file in a directory. Locally this works fine, but on EMR the test fails, getting stuck at around the 200th-300th file. Also, ps -eLF shows the number of children growing to 3000, even though the print statement shows only around the 200th file being processed.
Is this some bug on EMR related to reading max bytes?
pydoop version:
pydoop==0.12.0
import os
import sys
import shutil
import codecs
import pydoop.hdfs as hdfs

def prepare_data(hdfs_folder):
    folder = "test_folder"
    copies_count = 700
    src_file = "file"
    #1) create a folder
    if os.path.exists(folder):
        shutil.rmtree(folder)
    os.makedirs(folder)
    #2) create XXX copies of file in folder
    for x in range(0, copies_count):
        shutil.copyfile(src_file, folder + "/" + src_file + "_" + str(x))
    #3) copy folder to hdfs
    #hadoop fs -copyFromLocal test_folder/ /maaz/test_aa
    remove_command = "hadoop fs -rmr " + hdfs_folder
    print remove_command
    os.system(remove_command)
    command = "hadoop fs -copyFromLocal " + folder + " " + hdfs_folder
    print command
    os.system(command)

def main(hdfs_folder):
    try:
        conn_hdfs = hdfs.fs.hdfs()
        if conn_hdfs.exists(hdfs_folder):
            items_list = conn_hdfs.list_directory(hdfs_folder)
            for item in items_list:
                if not item["kind"] == "file":
                    continue
                file_name = item["name"]
                print "validating file : %s" % file_name
                try:
                    file_handle = conn_hdfs.open_file(file_name)
                    file_line = file_handle.readline()
                    print file_line
                    file_handle.close()
                except Exception as exp:
                    print '####Exception \'%s\' in reading file %s' % (str(exp), file_name)
                    file_handle.close()
                    continue
        conn_hdfs.close()
    except Exception as e:
        print "####Exception \'%s\' in validating files!" % str(e)

if __name__ == '__main__':
    hdfs_path = '/abc/xyz'
    prepare_data(hdfs_path)
    main(hdfs_path)
I suggest using the subprocess module for reading the first line instead of pydoop's conn_hdfs.open_file
import subprocess

cmd = 'hadoop fs -cat {f} | head -1'.format(f=file_name)
process = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
stdout, stderr = process.communicate()
if stderr == '':
    file_line = stdout.split('\n')[0]
else:
    print "####Exception '{e}' in reading file {f}".format(f=file_name, e=stderr)
    continue
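A sketch of how this could slot into the existing loop in main() (the helper name read_first_line is mine, not part of the original answer):

import subprocess

def read_first_line(file_name):
    # Shell out to `hadoop fs -cat` and keep only the first line of the file.
    cmd = 'hadoop fs -cat {f} | head -1'.format(f=file_name)
    process = subprocess.Popen(cmd, shell=True,
                               stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    stdout, stderr = process.communicate()
    if stderr != '':
        raise IOError(stderr)
    return stdout.split('\n')[0]

# inside main(), replacing the conn_hdfs.open_file block:
#     try:
#         print read_first_line(file_name)
#     except IOError as exp:
#         print '####Exception \'%s\' in reading file %s' % (str(exp), file_name)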
I wrote a script to read PDF metadata to ease a task at work. The current working version is not very usable in the long run:
from pyPdf import PdfFileReader

BASEDIR = ''
PDFFiles = []

def extractor():
    output = open('windoutput.txt', 'r+')
    for file in PDFFiles:
        try:
            pdf_toread = PdfFileReader(open(BASEDIR + file, 'r'))
            pdf_info = pdf_toread.getDocumentInfo()
            #print str(pdf_info)  # print full metadata if you want
            x = file + "~" + pdf_info['/Title'] + " ~ " + pdf_info['/Subject']
            print x
            output.write(x + '\n')
        except:
            x = file + '~' + ' ERROR: Data missing or corrupt'
            print x
            output.write(x + '\n')
            pass
    output.close()

if __name__ == "__main__":
    extractor()
Currently, as you can see, I have to manually input the working directory and manually populate the list of PDF files. It also just prints out the data in the terminal in a format that I can copy/paste/separate into a spreadsheet.
I'd like the script to work automatically in whichever directory I throw it in and populate a CSV file for easier use. So far:
from pyPdf import PdfFileReader
import csv
import os

def extractor():
    basedir = os.getcwd()
    extension = '.pdf'
    pdffiles = [filter(lambda x: x.endswith('.pdf'), os.listdir(basedir))]
    with open('pdfmetadata.csv', 'wb') as csvfile:
        for f in pdffiles:
            try:
                pdf_to_read = PdfFileReader(open(f, 'r'))
                pdf_info = pdf_to_read.getDocumentInfo()
                title = pdf_info['/Title']
                subject = pdf_info['/Subject']
                csvfile.writerow([file, title, subject])
                print 'Metadata for %s written successfully.' % (f)
            except:
                print 'ERROR reading file %s.' % (f)
                #output.writerow(x + '\n')
                pass

if __name__ == "__main__":
    extractor()
In its current state it just prints a single error message (the message from my except block, not an error raised by Python) and then stops. I've been staring at it for a while and I'm not really sure where to go from here. Can anyone point me in the right direction?
writerow([file, title, subject]) should be writerow([f, title, subject])
You can use sys.exc_info() to print the details of your error
http://docs.python.org/2/library/sys.html#sys.exc_info
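For instance, a minimal sketch of that suggestion (the failing call here is just a stand-in for whatever raises inside your loop):

import sys

try:
    raise KeyError('/Title')  # stand-in for a missing PDF metadata key
except:
    # sys.exc_info() returns (type, value, traceback) for the exception being handled
    exc_type, exc_value, exc_traceback = sys.exc_info()
    print 'ERROR: %s %s' % (exc_type, exc_value)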
Did you check that the pdffiles variable contains what you think it does? I was getting a list inside a list... so maybe try:

for files in pdffiles:
    for f in files:
        # do stuff with f
I personally like glob. Notice I add * before the .pdf in the extension variable:
import os
import glob

basedir = os.getcwd()
extension = '*.pdf'
pdffiles = glob.glob(os.path.join(basedir, extension))
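Putting the answers together, a sketch of the whole extractor might look like this (the csv.writer wrapper is my addition, needed because a plain file object has no writerow method; everything else just combines the fixes above):

from pyPdf import PdfFileReader
import csv
import glob
import os

def extractor():
    basedir = os.getcwd()
    pdffiles = glob.glob(os.path.join(basedir, '*.pdf'))
    with open('pdfmetadata.csv', 'wb') as csvfile:
        writer = csv.writer(csvfile)  # wrap the file object so writerow() exists
        for f in pdffiles:
            try:
                pdf_info = PdfFileReader(open(f, 'rb')).getDocumentInfo()
                writer.writerow([f, pdf_info['/Title'], pdf_info['/Subject']])
                print 'Metadata for %s written successfully.' % f
            except Exception as err:
                print 'ERROR reading file %s: %s' % (f, err)

if __name__ == '__main__':
    extractor()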
Figured it out. The script I used to download the files was saving the files with '\r\n' trailing after the file name, which I didn't notice until I actually ls'd the directory to see what was up. Thanks for everyone's help.