Python create sqlite DB - python

I have the current code for a sqlite db creation:
import storage
import os
import audiotools
# Walk directory `d` and hand every file found to a storage.HashStore.
# NOTE(review): indentation was lost when this snippet was pasted; the
# nesting is implied by the statement order.
def store_dir(d):
store = storage.HashStore()
# os.walk yields (dirpath, dirnames, filenames); `bar` (dirnames) is unused.
for root, bar, files in os.walk(d):
for filename in files:
# Build the full path from the directory and the bare file name.
filename = root + '/' + filename
try:
store.store_file(filename)
# NOTE: "% filename" is inside the string literal, so the literal text is
# printed and no substitution takes place.
print ("Stored %s% filename")
except audiotools.UnsupportedFile:
# NOTE: on Python 3 the % operator is applied to the return value of
# print(...); on Python 2 the print statement formats the string first.
print ('Skipping unsupported file %s') % filename
# NOTE: the "except X, e" form is Python 2 only; Python 3 rejects it as a
# syntax error.
except Exception, e:
print (e)
# Prompt for the music directory and store every file beneath it.
def main():
# NOTE: on Python 2, input() evaluates what the user types as an expression;
# on Python 3 it returns the typed text as a string.
d = input('Enter the path to the music directory: ')
store_dir(d)
print ("Done.")
# Run main() only when the file is executed as a script.
if __name__ == '__main__':
main()
When this code runs I get a syntax error msg. Please help !
Thanks in advance

There are a few things to address here.
First, this line:
print ('Skipping unsupported file %s') % filename
needs to be this:
print ('Skipping unsupported file %s' % filename)
Second, you need to use raw_input here:
d = input('Enter the path to the music directory: ')
which returns a string object, instead of input, which evaluates input as real Python code.
Third, your indentation is off. I'm pretty sure this is just a SO formatting error though.
Finally, you should use os.path.join here:
filename = root + '/' + filename
That isn't an error though, just a tip.
All in all, your code should look like this:
import storage
import os
import audiotools
def store_dir(d):
    """Store every file found beneath directory ``d`` in a hash store.

    Walks ``d`` recursively with ``os.walk`` and passes each file's full
    path to ``storage.HashStore.store_file``. A file that raises
    ``audiotools.UnsupportedFile`` is reported and skipped; any other
    exception is printed and the walk continues with the next file.

    Args:
        d: Path of the directory to walk.
    """
    store = storage.HashStore()
    # The sub-directory list yielded by os.walk is not needed here.
    for root, _dirs, files in os.walk(d):
        for name in files:
            filename = os.path.join(root, name)
            try:
                store.store_file(filename)
                # The % operator has to sit outside the string literal,
                # otherwise the file name is never substituted.
                print("Stored %s" % filename)
            except audiotools.UnsupportedFile:
                print('Skipping unsupported file %s' % filename)
            except Exception as e:  # best effort: report and keep going
                print(e)
def main():
    """Ask for the music directory, store its files, then report completion."""
    music_dir = raw_input('Enter the path to the music directory: ')
    store_dir(music_dir)
    print("Done.")


if __name__ == '__main__':
    main()

Related

python script throwing an error that i can deal with

I have created the script below to remove unwanted genes from a database. It works well the first time it is used, but when I re-run it, it gives me an error message such as:
shutil.Error: Destination path 'path/rejected_database_genes/gene_A.fa' already exists
And obviously it stops. I think that the problem arises when there are two or more genes in the file that needs to be removed.
Kind regards, and thanks for your help.
import glob, sys, os, shutil
from Bio import SeqIO, SearchIO
from Bio.SeqRecord import SeqRecord
import argparse
def help_function():
    """Print the placeholder greeting ``Hi``."""
    # Call form of print: valid on both Python 2 and Python 3.
    print('Hi')
# Command-line interface: a single option, -input_file / -i, whose help text
# is 'path_to_data'.
parser = argparse.ArgumentParser()
parser.add_argument('-input_file', '-i',type=str,help='path_to_data')
# Parsed at module level; main() reads opts.input_file.
opts = parser.parse_args()
def check_file_exists(filepath, file_description):
    """Exit the program unless ``filepath`` exists.

    Args:
        filepath: Path tested with ``os.path.exists``.
        file_description: Label used in the printed message.

    Raises:
        SystemExit: With status 1 when ``filepath`` does not exist.
    """
    if not os.path.exists(filepath):
        print("The " + file_description + " (" + filepath + ") does not exist")
        sys.exit(1)
    # sys.exit() never returns, so the success message needs no else branch.
    print(file_description + " detected")
# Parse the FASTA file `alleles_files` and print the id of every record whose
# sequence is shorter than 5, contains 'N', or contains '-'; the file is moved
# to `destination` with shutil.move.
# NOTE(review): indentation was lost in this paste, so it is unclear whether
# the shutil.move call runs once per flagged record or once per file.
def remove_empty_files(alleles_files,destination):
# The handle is never closed explicitly.
input_handle=open(alleles_files, 'r')
gene_records=list(SeqIO.parse(input_handle, 'fasta'))
for gene_record in gene_records:
#filename=gene_record.id[0]
#count=0
if len(gene_record.seq)<5 or 'N'in gene_record.seq:
print gene_record.id
elif '-' in gene_record.seq:
print gene_record.id
#count+=1
# Moves the whole input file, not just the offending record.
shutil.move(alleles_files, destination)
# Create <input_file>/rejected_database_genes if it is missing, then call
# remove_empty_files on every *.fa file directly inside <input_file>.
def main():
destination=opts.input_file + '/rejected_database_genes'
if os.path.exists(destination):
print 'Folder already exits'
else:
os.makedirs(destination)
print 'Folder has been created'
files=glob.glob(opts.input_file+'/*.fa')
#print files
#sys.exit()
for f in files:
#print f
#sys.exit()
# `f` is already a path returned by glob; it is globbed again and the first
# match is taken.
alleles_files=glob.glob(f)[0]
#print alleles_files
#sys.exit()
remove_empty_files(alleles_files,destination)
print 'Files have been removed'
# Called unconditionally at module level.
main()
The problem is in the shutil.move line. If you specify the full path (including the file name) for both the source and the destination, the existing file is overwritten and you will not get this error. If you don't want to overwrite it and need both files, give the destination file a different name.
What I wanted was for the script to move a file as soon as it found an unwanted sequence in it, and to store it somewhere else so I can check it later. The problem I was having was that if the same file contained two unwanted sequences, the script would throw an error telling me that the file already existed at the destination, and then stop. I managed to solve this problem by adding an if statement. The corrected script is the one below:
import glob, sys, os, shutil
from Bio import SeqIO, SearchIO
from Bio.SeqRecord import SeqRecord
import argparse
def help_function():
    """Print the placeholder greeting ``Hi``."""
    # Call form of print: valid on both Python 2 and Python 3.
    print('Hi')
# Command-line interface: a single option, -input_file / -i, whose help text
# is 'path_to_data'.
parser = argparse.ArgumentParser()
parser.add_argument('-input_file', '-i',type=str,help='path_to_data')
# Parsed at module level; main() reads opts.input_file.
opts = parser.parse_args()
def check_file_exists(filepath, file_description):
    """Exit the program unless ``filepath`` exists.

    Args:
        filepath: Path tested with ``os.path.exists``.
        file_description: Label used in the printed message.

    Raises:
        SystemExit: With status 1 when ``filepath`` does not exist.
    """
    if not os.path.exists(filepath):
        print("The " + file_description + " (" + filepath + ") does not exist")
        sys.exit(1)
    # sys.exit() never returns, so the success message needs no else branch.
    print(file_description + " detected")
def remove_empty_files(alleles_files, destination):
    """Move a FASTA file to ``destination`` if it holds an unwanted sequence.

    The records of ``alleles_files`` are scanned in order. A record is
    unwanted when its sequence is shorter than 5, contains ``'N'`` or
    contains ``'-'``. At the first unwanted record the file is moved to
    ``destination``, a one-element list holding that record's gene ID is
    printed, and scanning stops. Files without unwanted records are left
    where they are.

    Args:
        alleles_files: Path of the FASTA file to check.
        destination: Where the file is moved to (passed to ``shutil.move``).
    """
    # Read everything up front so the handle is closed before the file moves.
    with open(alleles_files, 'r') as input_handle:
        gene_records = list(SeqIO.parse(input_handle, 'fasta'))

    for gene_record in gene_records:
        sequence = gene_record.seq
        if len(sequence) < 5 or 'N' in sequence or '-' in sequence:
            # Gene ID = first two '_'-separated fields of the record id
            # (the whole id when it has fewer than two fields).
            gene_id = '_'.join(gene_record.id.split('_')[:2])
            # Move exactly once: after the first move the source is gone,
            # so a second move of the same file would fail.
            shutil.move(alleles_files, destination)
            print([gene_id])
            return
def main():
    """Move every ``*.fa`` file with an unwanted sequence to a reject folder.

    Prints the argparse help and exits when the script is run without
    arguments. Otherwise checks that ``opts.input_file`` exists, makes sure
    its ``rejected_database_genes`` sub-folder is there, and runs
    ``remove_empty_files`` on each ``*.fa`` file directly inside the input
    folder.
    """
    if len(sys.argv) <= 1:
        parser.print_help()
        sys.exit()

    check_file_exists(opts.input_file, 'input_file')
    destination = os.path.join(opts.input_file, 'rejected_database_genes')
    if os.path.exists(destination):
        print('Folder already exists')
    else:
        os.makedirs(destination)
        print('Folder has been created')

    # glob already returns concrete paths, so each one is used as is;
    # globbing a path again can come back empty when the file name contains
    # characters such as '[' that glob treats as pattern syntax.
    for alleles_files in glob.glob(os.path.join(opts.input_file, '*.fa')):
        remove_empty_files(alleles_files, destination)
    print('Files have been removed')


main()
With the third "if" statement added, the script moves the file to the destination as soon as it finds an unwanted sequence, and then goes on to check the next file.

How to encrypt multiple files using python

I am trying to search for .txt files in a specified folder and encrypt each one of the .txt files found using my encryption algorithms. However I cannot seem to be able to figure out how to encrypt all the .txt files found within the folder and rename them
this is the code I am working with currently
import time, os, sys, encrypt, decrypt, caesarCipher, reverseCipher, vigenereCipher, glob
# Walk /Ransom, read each .txt file found and write its translated
# (encrypted or decrypted) content to `outputFilename`.
# NOTE(review): indentation was lost in this paste; the nesting is implied by
# the statement order.
def main():
# NOTE: this name is never reassigned, so the same output path is used for
# every input file.
outputFilename = 'ABC.encrypted.txt'
mKey = 5
myMode = 'encrypt'
for root, dirs, files in os.walk('/Ransom'):
for file in files:
if file.endswith((".txt")):
inputFilename = os.path.join(root, file)
# The path was just produced by os.walk; this check guards against it
# disappearing in the meantime.
if not os.path.exists(inputFilename):
print('The file %s does not exist. Exiting....' % (inputFilename))
sys.exit()
fileObj = open(inputFilename)
content = fileObj.read()
fileObj.close()
print ('%sing...' % (myMode.title()))
# startTime is recorded but not read again in this snippet.
startTime = time.time()
if myMode == 'encrypt':
translated = encrypt.encryptMess(mKey, content, myMode)
elif myMode == 'decrypt':
translated = decrypt.decryptMess(mKey, content, myMode)
# Mode 'w' truncates the output file each time it is opened.
outputFileObj = open(outputFilename, 'w')
outputFileObj.write(translated)
outputFileObj.close()
print('Done %sing %s (%s characters).' % (myMode, inputFilename, len(content)))
print('%sed file is %s.' % (myMode.title(), outputFilename))
# Run main() only when the file is executed as a script.
if __name__ == '__main__':
main()
I really appreciate any help to guide me into achieving this.
This code iterates over all the files in a given folder and calls a designated method whenever the file is '*.txt'
import os
# Folder whose files are processed.
baseUrl = './'


def encryptFile(filename):
    """Process one file from ``baseUrl``; for now it only prints its path.

    Args:
        filename: Name of the file, relative to ``baseUrl``.
    """
    # process one file here
    print(os.path.join(baseUrl, filename))
# next(os.walk(...)) gives the first (dirpath, dirnames, filenames) triple;
# index 2 is the list of file names directly inside baseUrl.
alist = next(os.walk(baseUrl))[2]
for afile in alist:
    if afile.endswith('.txt'):
        encryptFile(afile)

Pydoop stucks on readline from HDFS files

I am reading the first line of every file in a directory. Locally it works fine, but on EMR this test gets stuck at around the 200th-300th file.
Also, ps -eLF shows the number of children growing to about 3000, even though the output has only reached the 200th file.
Is this some bug on EMR, such as a limit on the maximum number of bytes that can be read?
pydoop version
pydoop==0.12.0
import os
import sys
import shutil
import codecs
import pydoop.hdfs as hdfs
def prepare_data(hdfs_folder):
    """Build a local folder of file copies and upload it to HDFS.

    Recreates the local ``test_folder``, fills it with 700 numbered copies
    of the local file ``file``, then removes ``hdfs_folder`` and copies the
    local folder to it through the ``hadoop fs`` command line. Each command
    is printed before it is run.

    Args:
        hdfs_folder: HDFS path that is removed and then repopulated.
    """
    local_dir = "test_folder"
    total_copies = 700
    template = "file"

    # 1) start from an empty local folder
    if os.path.exists(local_dir):
        shutil.rmtree(local_dir)
    os.makedirs(local_dir)

    # 2) fill it with numbered copies of the template file
    for index in range(total_copies):
        shutil.copyfile(template, "%s/%s_%d" % (local_dir, template, index))

    # 3) replace the HDFS folder with the local one,
    #    e.g. hadoop fs -copyFromLocal test_folder/ /maaz/test_aa
    shell_lines = (
        "hadoop fs -rmr " + hdfs_folder,
        "hadoop fs -copyFromLocal " + local_dir + " " + hdfs_folder,
    )
    for shell_line in shell_lines:
        print(shell_line)
        os.system(shell_line)
def main(hdfs_folder):
    """Print the first line of every file directly inside ``hdfs_folder``.

    Directory entries whose ``kind`` is not ``"file"`` are skipped. A
    failure while opening or reading one file is reported and the loop
    moves on to the next file; any other failure is reported once and ends
    the run.

    Args:
        hdfs_folder: HDFS directory whose files are validated.
    """
    try:
        conn_hdfs = hdfs.fs.hdfs()
        try:
            if conn_hdfs.exists(hdfs_folder):
                for item in conn_hdfs.list_directory(hdfs_folder):
                    if item["kind"] != "file":
                        continue
                    file_name = item["name"]
                    print("validating file : %s" % file_name)
                    try:
                        file_handle = conn_hdfs.open_file(file_name)
                        try:
                            print(file_handle.readline())
                        finally:
                            # Close only a handle that was really opened,
                            # whether or not readline() succeeded.
                            file_handle.close()
                    except Exception as exp:
                        print('####Exception \'%s\' in reading file %s' % (str(exp), file_name))
        finally:
            # Release the HDFS connection even when listing or reading failed.
            conn_hdfs.close()
    except Exception as e:
        print("####Exception \'%s\' in validating files!" % str(e))


if __name__ == '__main__':
    hdfs_path = '/abc/xyz'
    prepare_data(hdfs_path)
    main(hdfs_path)
I suggest using the subprocess module for reading the first line instead of pydoop's conn_hdfs.open_file
import subprocess
cmd='hadoop fs -cat {f}|head -1'.format(f=file_name)
process=subprocess.Popen(cmd, shell=True,stdout=subprocess.PIPE,stderr=subprocess.PIPE)
stdout, stderr=process.communicate()
if stderr!='':
file_line=stdout.split('\n')[0]
else:
print "####Exception '{e}' in reading file {f}".format(f=file_name,e=stdout)
continue

Using variables recursively in python

Can I use variables to set my zip path instead of entering it manually?
Example part of the code that works fine
if __name__ == '__main__':
zip_folder(r'Monday' ,
r'Monday.zip')
But can I use a variable instead of entering the day myself? For this second example I get an "invalid syntax" error:
today = "Monday"
today_zip = "Monday.zip"
if __name__ == '__main__':
zip_folder(r today,
r today_zip)
import zipfile
import sys
import os
def zip_folder(folder_path, output_path):
    """Zip the contents of an entire folder (with that folder included
    in the archive). Empty subfolders will be included in the archive
    as well.

    Args:
        folder_path: Folder to archive.
        output_path: Path of the zip file to create.

    Raises:
        SystemExit: With status 1, after printing the error, when an
            IOError, OSError or zipfile.BadZipfile occurs.
    """
    # Archive names are made relative to the folder's parent so that the
    # folder itself is the top-level entry. abspath() also normalises a
    # trailing separator, which would otherwise make dirname() return the
    # folder itself.
    parent_folder = os.path.dirname(os.path.abspath(folder_path))
    try:
        # The context manager closes the archive, and nothing is closed if
        # the archive could not be opened in the first place.
        with zipfile.ZipFile(output_path, 'w', zipfile.ZIP_DEFLATED) as zip_file:
            for root, folders, files in os.walk(folder_path):
                # Sub-folders are written too, so empty ones get an entry.
                for name in folders + files:
                    absolute_path = os.path.join(root, name)
                    relative_path = os.path.relpath(absolute_path, parent_folder)
                    print("Adding '%s' to archive." % absolute_path)
                    zip_file.write(absolute_path, relative_path)
        print("'%s' created successfully." % output_path)
    except (IOError, OSError, zipfile.BadZipfile) as message:
        print(message)
        sys.exit(1)


if __name__ == '__main__':
    zip_folder(r'Monday',
               r'Monday.zip')
You do not need to specify r here:
if __name__ == '__main__':
zip_folder( today, today_zip)
would work fine. r, u, etc. are prefixes for string literals in Python; they cannot be applied to variables and are not needed in your case.

Script that reads PDF metadata and writes to CSV

I wrote a script to read PDF metadata to ease a task at work. The current working version is not very usable in the long run:
from pyPdf import PdfFileReader
BASEDIR = ''
PDFFiles = []
def extractor():
    """Write title and subject of every PDF in ``PDFFiles`` to ``windoutput.txt``.

    For each name in ``PDFFiles`` the file ``BASEDIR + name`` is read and
    the line ``name~title ~ subject`` is printed and written to the output
    file. When the metadata cannot be read, an ``ERROR`` line is recorded
    for that file instead and processing continues.
    """
    # 'r+' is kept from the original: the output file must already exist.
    with open('windoutput.txt', 'r+') as output:
        for name in PDFFiles:
            try:
                # PDFs are binary data, so open them in 'rb'.
                with open(BASEDIR + name, 'rb') as pdf_file:
                    pdf_info = PdfFileReader(pdf_file).getDocumentInfo()
                    # Read the fields while the file is still open, in case
                    # the reader resolves values lazily.
                    x = name + "~" + pdf_info['/Title'] + " ~ " + pdf_info['/Subject']
            except Exception:
                # Best effort: record the failure and go on to the next file.
                x = name + '~' + ' ERROR: Data missing or corrupt'
            print(x)
            output.write(x + '\n')


if __name__ == "__main__":
    extractor()
Currently, as you can see, I have to manually input the working directory and manually populate the list of PDF files. It also just prints out the data in the terminal in a format that I can copy/paste/separate into a spreadsheet.
I'd like the script to work automatically in whichever directory I throw it in and populate a CSV file for easier use. So far:
from pyPdf import PdfFileReader
import csv
import os
# Collect the .pdf files in the current working directory and try to write
# each file's title and subject as a row of pdfmetadata.csv.
# NOTE(review): indentation was lost in this paste; the nesting is implied by
# the statement order.
def extractor():
basedir = os.getcwd()
# `extension` is assigned but not used below.
extension = '.pdf'
# NOTE: the filter(...) result is wrapped in an outer list, so `pdffiles`
# holds a single element.
pdffiles = [filter(lambda x: x.endswith('.pdf'), os.listdir(basedir))]
with open('pdfmetadata.csv', 'wb') as csvfile:
for f in pdffiles:
try:
pdf_to_read = PdfFileReader(open(f, 'r'))
pdf_info = pdf_to_read.getDocumentInfo()
title = pdf_info['/Title']
subject = pdf_info['/Subject']
# NOTE: `csvfile` is the object returned by open(), and `file` is not the
# loop variable `f`.
csvfile.writerow([file, title, subject])
print 'Metadata for %s written successfully.' % (f)
# The bare except reports every failure in the try body with the same message.
except:
print 'ERROR reading file %s.' % (f)
#output.writerow(x + '\n')
pass
# Run extractor() only when the file is executed as a script.
if __name__ == "__main__":
extractor()
In its current state it seems to just print a single error message (the message from my except block, not an error raised by Python) and then stop. I've been staring at it for a while and I'm not really sure where to go from here. Can anyone point me in the right direction?
writerow([file, title, subject]) should be writerow([f, title, subject])
You can use sys.exc_info() to print the details of your error
http://docs.python.org/2/library/sys.html#sys.exc_info
Did you check the pdffiles variable contains what you think it does? I was getting a list inside a list... so maybe try:
for files in pdffiles:
for f in files:
#do stuff with f
I personally like glob. Notice I add * before the .pdf in the extension variable:
import os
import glob

basedir = os.getcwd()
# The leading * turns the extension into a glob pattern matching any .pdf name.
extension = '*.pdf'
pdffiles = glob.glob(os.path.join(basedir, extension))
Figured it out. The script I used to download the files was saving the files with '\r\n' trailing after the file name, which I didn't notice until I actually ls'd the directory to see what was up. Thanks for everyone's help.

Categories