I have a directory of 170,000+ pickle files in multiple subdirectories, originally pickled with protocol=0. This hasn't been very efficient time- or space-wise.
I wrote a script to re-pickle (using cPickle, protocol=2) each file in the folder(s), but curiously, the script throws an exception while processing a particular file (file # 95,000).
Initially, I thought that the pickle file was corrupted, but when I load that exact pickle file from the IPython command line, it loads just fine.
So I'm dumbfounded as to why this happens. Here's my script; I'd appreciate any help:
import os
import cPickle
import numpy
import time
import re
from progressbar import ProgressBar
inpath = '/path/to/folder'
def list_files(dir):
    r = []
    subdirs = [x[0] for x in os.walk(dir)]
    for subdir in subdirs:
        files = os.walk(subdir).next()[2]
        if len(files) > 0:
            for file in files:
                r.append(subdir + "/" + file)
    return r
infileList = list_files(inpath)
print "Total number of files found: %d" % len(infileList)
print "\n\n"
progress = ProgressBar()
outfilename = " "
print "Processing pickle files. Pls wait..."
t0 = time.time()
filecount = 0
for file in progress(infileList):
    try:
        with open(file, "rb") as infile:
            arr = cPickle.load(infile)
        outfilename = re.sub('/initial/path/', '/new/path/', file)
        if not os.path.exists(os.path.dirname(outfilename)):
            os.makedirs(os.path.dirname(outfilename))
        with open(outfilename, "wb") as f:
            cPickle.dump(arr, f, protocol=2)
        filecount = filecount + 1
    except Exception as e:
        print "\n" + str(filecount)
        print "\nError occurred while processing file: " + file
        print "Exception: " + str(e)
        tx = time.time()
        print "\n Time elapsed: %.2f" % (tx - t0)
        continue
t1 = time.time()
total = t1-t0
print "Files repickled with protocol=2.\nRepickling execution time: %.2f sec" % total
I'm a beginner in Python and am trying to make a script that does the following:
Check whether the files exist in destFile
If they all exist, exit the script (don't do anything)
If some files are missing, copy only the missing files from srcFile to destFile
The script that I have made is working, but the issue that I would like your help with is making it copy only the missing file(s). Right now it copies everything from the first file (test1.txt) up to the missing file. For example, if test4.txt & test5.txt are missing in destFile, my script copies test1.txt through test5.txt instead of only the two missing files test4.txt & test5.txt.
import os, shutil
from datetime import datetime
count = 0
error = "ERROR! file is missing! (files have been copied)"
sttime = datetime.now().strftime('%d/%m/%Y - %H:%M:%S - ')
os.chdir("C:\log")
log = "log.txt"
srcFile = [r"C:\srcFile\test1.txt",
r"C:\srcFile\test2.txt",
r"C:\srcFile\test3.txt",
r"C:\srcFile\test4.txt",
r"C:\srcFile\test5.txt"]
destFile = [r"C:\destFile\test1.txt",
r"C:\destFile\test2.txt",
r"C:\destFile\test3.txt",
r"C:\destFile\test4.txt",
r"C:\destFile\test5.txt"]
for file in destFile:
    if not os.path.exists(file):
        for file_sr in srcFile:
            if not os.path.exists(file):
                shutil.copy(file_sr, 'C:\destFile')
                count += 1
                with open(log, 'a') as logfile:
                    logfile.write(sttime + error + " " + str(count) + " => " + file + '\n')
The problem is that you're iterating over all of the source files whenever you detect a missing destination file: for file_sr in srcFile:. Instead, you can copy just the missing file by keeping track of the position (in the array) of the missing destination file:
for position, file in enumerate(destFile):
    if not os.path.exists(file):
        file_sr = srcFile[position]
        shutil.copy(file_sr, 'C:\destFile')
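Equivalently, since the two lists are meant to stay in step, you could pair them up with zip() instead of indexing. A minimal sketch, assuming the same srcFile/destFile lists as above:

import os, shutil

for src, dst in zip(srcFile, destFile):
    # copy only when the destination file is missing
    if not os.path.exists(dst):
        shutil.copy(src, r'C:\destFile')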
Using your code, you can do:
import os, shutil
from datetime import datetime
count = 0
error = "ERROR! file is missing! (files have been copied)"
sttime = datetime.now().strftime('%d/%m/%Y - %H:%M:%S - ')
os.chdir("C:\log")
log = "log.txt"
srcFile = [r"C:\srcFile\test1.txt",
r"C:\srcFile\test2.txt",
r"C:\srcFile\test3.txt",
r"C:\srcFile\test4.txt",
r"C:\srcFile\test5.txt"]
destFile = [r"C:\destFile\test1.txt",
r"C:\destFile\test2.txt",
r"C:\destFile\test3.txt",
r"C:\destFile\test4.txt",
r"C:\destFile\test5.txt"]
for file in destFile:
    if not os.path.exists(file):
        # call replace on the path string of the missing file, not on the list
        src_file = file.replace("destFile", "srcFile")
        shutil.copy(src_file, file)
        count += 1
        with open(log, 'a') as logfile:
            logfile.write(sttime + error + " " + str(count) + " => " + file + '\n')
Thank you for your help, guys. My problem was exactly that I was iterating over all of the source files whenever I detected a missing destination file. The following logic from mackorone does what I was looking for.
for position, file in enumerate(destFile):
    if not os.path.exists(file):
        file_sr = srcFile[position]
        shutil.copy(file_sr, 'C:\destFile')
I have updated the script so that it now compares two folders, a source folder and a destination folder. If the destination folder is missing any files from the source folder, they are copied over. The script is working fine.
import os
import shutil
from datetime import datetime
sttime = datetime.now().strftime('%d/%m/%Y - %H:%M:%S - ')
error = "ERROR! file is missing! (files have been copied)"
des_path = 'C:\des_folder'
sr_path = 'C:\sr_folder'
des_folder = os.listdir(des_path)
sr_folder = os.listdir(sr_path)
count = 0
os.chdir("C:\log")
log = "log.txt"
def compare_folder(folder1, folder2):
    # use the parameters instead of reaching for the globals
    files_in_sr_folder = set(folder1) - set(folder2)
    return files_in_sr_folder

files_missing = compare_folder(sr_folder, des_folder)
if len(files_missing) != 0:
    for file in files_missing:
        full_path_files = os.path.join(sr_path, file)
        shutil.copy(full_path_files, des_path)
        count += 1
        with open(log, 'a') as logfile:
            logfile.write(sttime + error + " " + str(count) + " => " + file + '\n')
else:
    pass  # nothing is missing, so there is nothing to do
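As an aside, the standard library's filecmp module can do the folder comparison for you. A minimal sketch, assuming the same C:\sr_folder and C:\des_folder paths:

import filecmp
import os
import shutil

cmp = filecmp.dircmp(r'C:\sr_folder', r'C:\des_folder')
# left_only lists entries present in the source folder but absent from the destination
for name in cmp.left_only:
    src = os.path.join(r'C:\sr_folder', name)
    if os.path.isfile(src):  # dircmp also lists subdirectories; copy plain files only
        shutil.copy(src, r'C:\des_folder')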
My code below creates exception log files at this location:
C:/Users/Desktop/SampleTestFiles/ProjectFiles/ExceptionLogFiles/
Initially the code keeps writing into ExceptionLog_1.txt whenever an exception occurs, and when the size of that file exceeds 1 MB it starts writing to ExceptionLog_2.txt, until its size reaches 1 MB. So far, this works perfectly, but only for those first two files. When the size of the second file exceeds 1 MB it should log exceptions into a third log file, ExceptionLog_3.txt, but it does not work: the code keeps writing into the second file.
How can I modify my code to make sure a new file is created whenever the size of the latest log file exceeds 1 MB?
def WriteExceptionToFile(self, traceback):
    count = 1
    fileDir = 'C:/Users/Desktop/SampleTestFiles/ProjectFiles/ExceptionLogFiles/'
    # check if the path exists, create directory if not.
    if not os.path.exists(fileDir):
        os.mkdir(fileDir)
    filename = "ExceptionLog_" + str(count) + ".txt"
    filepath = os.path.join(fileDir, filename)
    try:
        if os.path.getsize(filepath) < 1048576:  # if file size is less than 1 MB
            filename = "ExceptionLog_" + str(count) + ".txt"
        else:
            filename = "ExceptionLog_" + str(count + 1) + ".txt"
    except OSError:
        print("Path '%s' does not exist or is inaccessible" % filepath)
        filename = "ExceptionLog_1.txt"
    filepath = os.path.join(fileDir, filename)
    with open(filepath, 'a+') as f:
        traceback.print_exc(file=f)
You could also try an approach using rotating files from the logging module.
Example directly from the documentation (https://docs.python.org/3/howto/logging-cookbook.html):
import glob
import logging
import logging.handlers
LOG_FILENAME = 'logging_rotatingfile_example.out'
# Set up a specific logger with our desired output level
my_logger = logging.getLogger('MyLogger')
my_logger.setLevel(logging.DEBUG)
# Add the log message handler to the logger, HERE YOU CAN SPECIFY THE FILE SIZE
handler = logging.handlers.RotatingFileHandler(
    LOG_FILENAME, maxBytes=20, backupCount=5)
my_logger.addHandler(handler)

# Log some messages
for i in range(20):
    my_logger.debug('i = %d' % i)

# See what files are created
logfiles = glob.glob('%s*' % LOG_FILENAME)
for filename in logfiles:
    print(filename)
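For the 1 MB limit in your question, you would presumably pass maxBytes=1048576 instead of the tiny demo value, e.g.:

handler = logging.handlers.RotatingFileHandler(
    'ExceptionLog.txt', maxBytes=1048576, backupCount=10)  # 1 MB per file, keep 10 old files

The handler then takes care of rolling over to a new file for you, which is exactly the bookkeeping your count variable is trying to do by hand.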
I would suggest going with a class; that way you won't have to worry about maintaining the correct count elsewhere.
Check out the solution below:
import os

class GenericExceptionWriter:
    def __init__(self):
        self.count = 1
        self.fileDir = 'C:/Users/Desktop/SampleTestFiles/ProjectFiles/ExceptionLogFiles/'
        os.makedirs(self.fileDir, exist_ok=True)
        self.currentFilePath = "".join(["ExceptionLog_", str(self.count), ".txt"])
        self.maxSize = 1048576

    def checkSize(self):
        # join with the log directory so the size check looks at the right file
        if os.path.getsize(os.path.join(self.fileDir, self.currentFilePath)) > self.maxSize:
            self.count += 1
            self.currentFilePath = "".join(["ExceptionLog_", str(self.count), ".txt"])

    def WriteExceptionToFile(self, traceback):
        try:
            self.checkSize()
        except OSError:
            print("Path '%s' does not exist or is inaccessible" % self.currentFilePath)
        filepath = os.path.join(self.fileDir, self.currentFilePath)
        with open(filepath, 'a+') as f:
            traceback.print_exc(file=f)
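A quick usage sketch (my own example, with a hypothetical division by zero just to have something to log):

import traceback

writer = GenericExceptionWriter()
try:
    1 / 0
except ZeroDivisionError:
    writer.WriteExceptionToFile(traceback)

Note that WriteExceptionToFile receives the traceback module itself and calls its print_exc, so you pass the module, not a traceback object.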
This is my code. I get this error when I try to execute the script:

raise BadZipFile("File is not a zip file")
BadZipFile: File is not a zip file

This is my source directory path:
data_dir = r'L:\DataQA\Python Unzip Files\Source Zipped'
I have multiple zipped folders within the ‘Source Zipped’ (uncompressed) folder. The same code works when I zip all the subfolders of Source Zipped into a single zipped folder, but I don't want that approach.
import os
import zipfile
import shutil
import json
import logging
import logging.config
import time
def my_start_time():
    global start_time, cumulative_time, start_time_stamp
    start_time = time.time()
    this_time = time.localtime(start_time)
    start_time_stamp = '{:4d}{:02d}{:02d} {:02d}:{:02d}:{:02d}'.format(
        this_time.tm_year, this_time.tm_mon, this_time.tm_mday,
        this_time.tm_hour, this_time.tm_min, this_time.tm_sec)
    cumulative_time = start_time - start_time
    logging.info('Initial Setup: {:s}'.format(start_time_stamp))

def my_time():
    global cumulative_time
    time_taken = time.time() - start_time
    incremental_time = time_taken - cumulative_time
    cumulative_time = time_taken
    logging.info("Started: %s Complete: Cumulative: %.4f s Incremental: %.4f s\n"
                 % (start_time_stamp, cumulative_time, incremental_time))

logging.basicConfig(filename='myunzip_task_log.txt', level=logging.DEBUG)
my_start_time()
logging.info('Initial Setup...')

def write_to_json(data, file):
    value = False
    with open(file, 'w') as f:
        json.dump(json.dumps(data, sort_keys=True), f)
    value = True
    return value
data_dir = r'L:\DataQA\Python Unzip Files\Source Zipped'
temp_dir = r'L:\DataQA\Python Unzip Files\temp1'
new_dir = r'L:\DataQA\Python Unzip Files\temp2'
final_dir = r'L:\DataQA\Python Unzip Files\Destination Unzipped files'
big_list = os.listdir(data_dir)
archive_count = 0
file_count = 152865
basename1 = os.path.join(final_dir,'GENERIC_ROUGHDRAFT')
basename2 = os.path.join(final_dir,'XACTDOC')
my_time()
archive_count = len(big_list)
logging.info('Unzipping {} archives...'.format(archive_count))
for folder in big_list:
    prior_count = file_count
    logging.info('Starting: {}'.format(folder))
    try:
        shutil.rmtree(temp_dir)
    except FileNotFoundError:
        pass
    os.mkdir(temp_dir)
    with zipfile.ZipFile(os.path.join(data_dir, folder), mode='r') as a_zip:
        a_zip.extractall(path=temp_dir)
    archive_count += 1
    logging.info('Cumulative total of {} archive(s) unzipped'.format(archive_count))
    bigger_list = os.listdir(temp_dir)
    logging.info('Current archive contains {} subfolders'.format(len(bigger_list)))
    for sub_folder in bigger_list:
        with zipfile.ZipFile(os.path.join(temp_dir, sub_folder), mode='r') as b_zip:
            b_zip.extractall(path=new_dir)
        file1 = "%s (%d).%s" % (basename1, file_count, 'xml')
        file2 = "%s (%d).%s" % (basename2, file_count, 'xml')
        shutil.copy(os.path.join(new_dir, 'GENERIC_ROUGHDRAFT.xml'), file1)
        shutil.copy(os.path.join(new_dir, 'XACTDOC.xml'), file2)
        file_count += 1
    logging.info('{} subfolders unzipped'.format(file_count - prior_count))

#os.remove(data_dir)
shutil.rmtree(data_dir)
os.mkdir(data_dir)
#os.unlink(data_dir)

my_time()
logging.info('Total of {0} files -- {1} pairs -- should be in {2}'.format(2*(file_count-1), file_count-1, final_dir))
time.sleep(1)
my_time()
In both zip archive open statements:
with zipfile.ZipFile(os.path.join(data_dir,folder),mode='r')
and
with zipfile.ZipFile(os.path.join(temp_dir,sub_folder),mode='r')
nothing (at least nothing that we can check) guarantees that the file names you're passing are actually .zip files. It could be a directory, an already extracted file, some file that was already there...
I suggest that you check the file extension prior to extracting, for instance:
import fnmatch
zfn = os.path.join(temp_dir, sub_folder)
if fnmatch.fnmatch(zfn, "*.zip"):
    with zipfile.ZipFile(zfn, mode='r') as whatever:
        whatever.extractall(path=new_dir)  # proceed as before
Some .zip files could be corrupt, but that's less likely. Also, if you wanted to extract .jar and other zip-structured files with a different extension, replace the fnmatch with:
if zfn.lower().endswith(('.zip','.jar','.docx')):
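Another option, not something the answer above used but worth knowing about, is zipfile.is_zipfile(), which checks the file's magic number rather than its name. A minimal sketch:

import os
import zipfile

zfn = os.path.join(temp_dir, sub_folder)
# is_zipfile() returns False for non-zip files and most corrupt archives;
# the isfile() guard also skips directories and anything else that isn't a plain file
if os.path.isfile(zfn) and zipfile.is_zipfile(zfn):
    with zipfile.ZipFile(zfn, mode='r') as b_zip:
        b_zip.extractall(path=new_dir)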
Well, I have two different scripts that I wrote.
The first one just gets an MD5 hash of every file that is a .exe.
The other script is an agent that checks every 3 seconds whether there are new files in the directory.
Now I need to make the agent check the files and also print every MD5.
These are my scripts:
import os, time
path_to_watch = "/root/PycharmProjects/untitled1"
before = dict ([(f, None) for f in os.listdir (path_to_watch)])
while 1:
    time.sleep(3)
    after = dict([(f, None) for f in os.listdir(path_to_watch)])
    added = [f for f in after if not f in before]
    removed = [f for f in before if not f in after]
    if added: print "Added: ", ", ".join(added)
    if removed: print "Removed: ", ", ".join(removed)
    before = after
And the second one, which checks for the MD5:
import glob
import os
import hashlib
work_path = '/root/PycharmProjects/untitled1/'
filenames = glob.glob("/root/PycharmProjects/untitled1/*.exe" )
if len(os.listdir(work_path)) > 0:
    for filename in filenames:
        with open(filename, 'rb') as inputfile:
            data = inputfile.read()
            print hashlib.md5(data).hexdigest()
else:
    print '0'
Thanks for the help!
How about extracting the hash generation from the iteration, wrapping it in a function, and calling it when a new file is found:
import time
import os
import hashlib

def md5(filename):
    with open(filename, 'rb') as inputfile:
        data = inputfile.read()
    print filename, hashlib.md5(data).hexdigest()

path_to_watch = "."
before = os.listdir(path_to_watch)
while 1:
    time.sleep(3)
    after = os.listdir(path_to_watch)
    added = [f for f in after if not f in before]
    removed = [f for f in before if not f in after]
    if added:
        print "Added: ", ", ".join(added)
        for filename in added:
            md5(filename)
    if removed:
        print "Removed: ", ", ".join(removed)
    before = after
Also stripped some unnecessary dict stuff from the code.
I suggest you take it as a challenge to reduce the number of statements and the number of data transformations to a minimum while keeping the function of the script. At the same time, it might be worth a look at the Python Style Guide ;)
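One more thought, not part of the original answer: reading whole files into memory can get expensive for large .exe files, so hashing in chunks keeps memory use flat. A minimal sketch:

import hashlib

def md5_chunked(filename, chunk_size=65536):
    digest = hashlib.md5()
    with open(filename, 'rb') as inputfile:
        # read fixed-size chunks until EOF so the whole file is never in memory
        chunk = inputfile.read(chunk_size)
        while chunk:
            digest.update(chunk)
            chunk = inputfile.read(chunk_size)
    return digest.hexdigest()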
I am reading the first line of all the files in a directory. Locally it works fine, but on EMR this test is failing, getting stuck at around the 200-300th file.
Also, ps -eLF shows the number of child processes growing to 3000, even when the print is on the 200th line.
Is this some bug on EMR related to reading max bytes?
pydoop version
pydoop==0.12.0
import os
import sys
import shutil
import codecs
import pydoop.hdfs as hdfs
def prepare_data(hdfs_folder):
    folder = "test_folder"
    copies_count = 700
    src_file = "file"
    # 1) create a folder
    if os.path.exists(folder):
        shutil.rmtree(folder)
    os.makedirs(folder)
    # 2) create XXX copies of file in folder
    for x in range(0, copies_count):
        shutil.copyfile(src_file, folder + "/" + src_file + "_" + str(x))
    # 3) copy folder to hdfs
    # hadoop fs -copyFromLocal test_folder/ /maaz/test_aa
    remove_command = "hadoop fs -rmr " + hdfs_folder
    print remove_command
    os.system(remove_command)
    command = "hadoop fs -copyFromLocal " + folder + " " + hdfs_folder
    print command
    os.system(command)

def main(hdfs_folder):
    try:
        conn_hdfs = hdfs.fs.hdfs()
        if conn_hdfs.exists(hdfs_folder):
            items_list = conn_hdfs.list_directory(hdfs_folder)
            for item in items_list:
                if not item["kind"] == "file":
                    continue
                file_name = item["name"]
                print "validating file : %s" % file_name
                try:
                    file_handle = conn_hdfs.open_file(file_name)
                    file_line = file_handle.readline()
                    print file_line
                    file_handle.close()
                except Exception as exp:
                    print '####Exception \'%s\' in reading file %s' % (str(exp), file_name)
                    file_handle.close()
                    continue
        conn_hdfs.close()
    except Exception as e:
        print "####Exception \'%s\' in validating files!" % str(e)

if __name__ == '__main__':
    hdfs_path = '/abc/xyz'
    prepare_data(hdfs_path)
    main(hdfs_path)
I suggest using the subprocess module for reading the first line instead of pydoop's conn_hdfs.open_file:
import subprocess

# drop-in replacement for the conn_hdfs.open_file block inside the
# for-loop in main() (hence the continue)
cmd = 'hadoop fs -cat {f} | head -1'.format(f=file_name)
process = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
stdout, stderr = process.communicate()
if stderr == '':
    file_line = stdout.split('\n')[0]
else:
    print "####Exception '{e}' in reading file {f}".format(f=file_name, e=stderr)
    continue
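The trade-off, as I read it, is that each file now shells out a fresh hadoop process instead of holding a pydoop file handle open, which sidesteps the kind of handle/thread build-up you saw in ps -eLF at the cost of process startup time per file.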