Fail to check files integrity - python

I'm writing a quick Python script to migrate files from one directory to another. Everything works out perfectly except for the part where I have to compare the filenames and checksum. I stored the file in both locations, that is, /root/src/file1 and /root/dst/file1.
When I ran the filename comparison, it failed to match the files because each key includes the whole file path. md5Srt is a dict that stores the files and their checksums.
Is there a way that I can compare the filenames without using the whole filepath?
# Walk every source entry and compare its checksum with the destination
# entry stored under the same key.
for key in md5Srt:
    if key not in md5Dst:
        # Keys that exist only on the source side are silently skipped.
        continue
    print("keys match " + key)
    print('\ncomparing the values of files\n')
    src_sum = md5Srt[key]
    dst_sum = md5Dst[key]
    if src_sum != dst_sum:
        print("values of files don't match")
        continue
    print(src_sum)
    print(dst_sum)
    print("files match\n")
    print("checking the next pair")

If you just have a bunch of files in a directory you can just use os.path.basename:
import os
>>> dst = os.path.basename('/root/dst/file1.file')
>>> src = os.path.basename('/root/src/file1.file')
>>> dst
'file1.file'
>>> src
'file1.file'
>>> dst == src
True
If you are dealing with subdirectories you would need to know the base src and dst directories and then remove them from the beginning of each path:
>>> src = '/root/src'
>>> dst = '/root/dst'
>>> src_file = '/root/src/dir1/file1.file'
>>> dst_file = '/root/dst/dir1/file1.file'
>>> os.path.relpath(src_file, src)
'dir1/file1.file'
>>> os.path.relpath(dst_file, dst)
'dir1/file1.file'
>>> os.path.relpath(src_file, src) == os.path.relpath(dst_file, dst)
True
If you combine this with your function you get:
import os

src = '/root/src'
dst = '/root/dst'

# NOTE(review): this loop assumes md5Srt / md5Dst map a shared key to the
# file's full path -- confirm against how the dicts are actually built.
for key, src_file in md5Srt.items():
    dst_file = md5Dst.get(key)
    if dst_file is None:
        print('The destination is missing %s' % src_file)
        continue
    print("keys match " + key)
    print('\ncomparing the values of files\n')
    # Compare the paths relative to their own base directory so that
    # /root/src/dir1/f and /root/dst/dir1/f count as the same file.
    if os.path.relpath(src_file, src) == os.path.relpath(dst_file, dst):
        print(src_file)
        print(dst_file)
        print("files match\n")
        print("checking the next pair")
    else:
        print("values of files don't match")
I think you should rethink trying to compare files by finding a file in dst that has the same md5sum as a file in src. If a file is renamed or there are two files with the same hash you may end up with directories that are not exactly the same. A better approach would be to compare filenames first, and then check md5sums if there is a file that is in both src and dst.
Here is what that might look like:
import os

src_dir = '/root/src'
dst_dir = '/root/dst'

# Reverse the dictionaries (ideally you would build them this way to begin
# with): a single file has exactly one md5sum, but the same md5sum can match
# several files, so the relative path makes the better key.
src_file_hashes = {os.path.relpath(v, src_dir): k for k, v in md5Srt.items()}
dst_file_hashes = {os.path.relpath(v, dst_dir): k for k, v in md5Dst.items()}

for src_file, src_hash in src_file_hashes.items():
    dst_hash = dst_file_hashes.get(src_file)
    src_path = os.path.join(src_dir, src_file)
    # The destination counterpart lives at the same relative path.
    dst_path = os.path.join(dst_dir, src_file)
    if dst_hash is None:
        print('The destination file %s is missing ' % dst_path)
        continue
    if src_hash == dst_hash:
        print('%s matches %s and %s' % (src_hash, src_path, dst_path))
    else:
        print('%s and %s have different hashes' % (src_path, dst_path))

Related

Find files containing the desired text and store all of those files in another folder

How can we identify the files in a directory that contain the desired text and then separate all of those files into another directory?
import os

user_input = input('What is the name of your directory')
directory = os.listdir(user_input)
searching = input('What word are you trying to find?')

for fname in directory:
    # Full path of the current directory entry
    full_path = os.path.join(user_input, fname)
    if not os.path.isfile(full_path):
        continue
    # The context manager closes the file even if reading it fails.
    with open(full_path, 'r') as f:
        if searching in f.read():
            print('found string in file %s' % fname)
        else:
            print('string not found')
shutil has many methods you can use. One of which is:
import shutil
shutil.copyfile(src, dst)
# 2nd option
shutil.copy(src, dst)
You can copy those files into another location by adding a copy call to the branch that finds the text:
if searching in f.read():
print('found string in file %s' % fname)
shutil.copy(user_input + os.sep + fname,destination)

safe copy file python program - what's problem in code

I'm trying to copy files from one location to another. I followed the program linked below, and I don't know what mistake I made, but the files are not being copied to the destination folder.
https://gist.github.com/alexwlchan/c2adbb8ee782f460e5ec
I don't know much about programming; I just followed the tutorial.
The only thing I added to the code is:
src = ("F:\\Work\\")
dst = ("F:\\ws\\")
So please correct me if i'm wrong.
Thanks in advance !
import filecmp
import os
import shutil
src = ("F:\\Work\\")
dst = ("F:\\ws\\")
def _increment_filename(filename, marker='-'):
basename, fileext = os.path.splitext(filename)
if marker not in basename:
base = basename
value = 0
else:
base, counter = basename.rsplit(marker, 1)
try:
value = int(counter)
except ValueError:
base = basename
value = 0
while True:
if value == 0:
value += 1
yield filename
value += 1
yield '%s%s%d%s' % (base, marker, value, fileext)
def copyfile(src, dst):
    """Copy the file ``src`` to ``dst`` without overwriting a different file.

    If a file already exists at the chosen destination:
      * if it is the same as ``src``, nothing is copied;
      * if it differs, the next candidate name from ``_increment_filename``
        is tried instead.

    ``dst`` itself is always the first candidate, even when its name already
    ends in a counter.

    Returns the path the content ended up at (or was already found at).
    Raises ValueError if ``src`` does not exist; OSError from the underlying
    open/read/write calls is propagated, except "file exists" on creation,
    which means another process won the race and the next name is tried.
    """
    import errno  # local import: the snippet's import list does not have it

    if not os.path.exists(src):
        raise ValueError('Source file does not exist: {}'.format(src))

    # A bare filename has no directory part; os.makedirs('') would fail.
    dst_dir = os.path.dirname(dst)
    if dst_dir and not os.path.exists(dst_dir):
        os.makedirs(dst_dir)

    # O_EXLOCK only exists on BSD/macOS and O_BINARY only on Windows, so both
    # are looked up defensively instead of assumed.
    binary = getattr(os, 'O_BINARY', 0)
    read_flags = os.O_RDONLY | binary
    write_flags = (os.O_WRONLY | os.O_EXCL | os.O_CREAT | binary
                   | getattr(os, 'O_EXLOCK', 0))

    # One generator for the whole call: re-creating it from an already
    # incremented name would skip every other counter value.
    candidates = _increment_filename(dst)
    candidate = dst

    src_fd = os.open(src, read_flags)
    try:
        while True:
            if os.path.exists(candidate):
                if filecmp.cmp(src, candidate):
                    return candidate
            else:
                try:
                    # O_EXCL makes creation fail if the file appeared after
                    # the exists() check above.
                    dst_fd = os.open(candidate, write_flags)
                except OSError as e:
                    if e.errno != errno.EEXIST:
                        raise
                    print('Race condition: %s just popped into existence' % candidate)
                else:
                    try:
                        while True:
                            data = os.read(src_fd, 65536)
                            if not data:
                                break
                            # os.write may write fewer bytes than requested.
                            while data:
                                data = data[os.write(dst_fd, data):]
                    finally:
                        os.close(dst_fd)
                    return candidate
            # This candidate is taken by a different file: move on.
            candidate = next(candidates)
            if candidate == dst:
                candidate = next(candidates)
    finally:
        os.close(src_fd)
def move(src, dst):
    """Move ``src`` to ``dst`` using ``copyfile`` and return the final path.

    The source is removed only when the returned path is a different file;
    otherwise (``src`` and ``dst`` name the same file) deleting it would
    destroy the only copy.
    """
    final_path = copyfile(src, dst)
    same_file = (os.path.normcase(os.path.realpath(src))
                 == os.path.normcase(os.path.realpath(final_path)))
    if not same_file:
        os.remove(src)
    return final_path
There is no error in the program; it runs fine, but the destination folder stays empty.
The expected result is that the files are copied to the destination folder, following the behaviour the program describes:
If a file already exists at dst, it will not be overwritten, but:
* If it is the same as the source file, do nothing
* If it is different to the source file, pick a new name for the copy that
is distinct and unused, then copy the file there.
I'm sorry, but that code seems massively overcomplicated for what it does. The following should work in almost all cases. If dst already exists and is not a directory, it appends underscores ( _ ) to the directory name until a usable one is found:
import os
import shutil
import filecmp
src = ("D:\\Documents\\oof")
dst = ("D:\\oof")
validextensions = ["jpeg", "txt", "pdf", "pptx"]


def copydir(src, dst):
    """Recursively copy files with an allowed ending from ``src`` to ``dst``.

    Only files whose name ends with one of ``validextensions`` are copied;
    sub-directories are processed recursively. Nothing is overwritten: if a
    different file already sits at the target path, an underscore is inserted
    before the extension until the name is free or an identical file is found.
    If ``dst`` exists but is not a directory, underscores are appended to it
    until the name is free or names an existing directory.

    Prints a message and returns None when ``src`` is not a directory.
    """
    if not os.path.isdir(src):
        print("Source directory doesn't exist.")
        return None

    # Checking exists() as well keeps this loop finite: a name that does not
    # exist yet is "not a directory" too, but it is free to be created.
    while os.path.exists(dst) and not os.path.isdir(dst):
        dst += "_"
    if not os.path.isdir(dst):
        os.mkdir(dst)

    allowed = tuple(validextensions)
    for name in os.listdir(src):
        frompath = os.path.join(src, name)
        topath = os.path.join(dst, name)
        if os.path.isdir(frompath):
            copydir(frompath, topath)
            continue
        if not os.path.isfile(frompath) or not name.endswith(allowed):
            continue
        # Find a target that is either unused or already identical.
        while os.path.isfile(topath) and not filecmp.cmp(frompath, topath):
            # splitext only looks at the final component, so dots in
            # directory names are left alone.
            root, ext = os.path.splitext(topath)
            topath = root + "_" + ext
        if not os.path.isfile(topath):
            shutil.copyfile(frompath, topath)
copydir(src, dst)
I'm loving how this is progressively becoming more complex as the OP lists more features they wanted. *facepalm*

Python: can't create subdirectory

I want to apply a test to a list of files. The files that pass the test should be moved to the directory "Pass"; the others should be moved to the directory "Fail".
Thus the output directory should contain subdirectories "Pass" and "Fail".
Here is my attempt:
# Asker's pseudo-code: the condition is a placeholder, and the definition of
# MoveFileToDirectory is not shown.
if(<scan==pass>) # Working fine up to this point
    dest_dir = outDir + '\\' + 'Pass' # The problem is here
    print("Pass", xmlfile)
    MoveFileToDirectory(inDir, xmlfile, dest_dir)
else:
    # NOTE(review): os.path.dirname() drops the last path component, so this
    # presumably evaluates back to outDir rather than outDir\Fail -- confirm.
    dest_dir = os.path.dirname(outDir + '\\' + 'Fail')
    print("Fail: ", xmlfile)
    MoveFileToDirectory(inDir, xmlfile, dest_dir)
However, my code is moving the files to the output directory and not creating the "Pass" or "Fail" subdirectories. Any ideas why?
Use os.path.join(). Example:
os.path.join(outDir, 'Pass')
See this SO post
Also, we don't know what MoveFileToDirectory does. Use the standard os.rename:
os.rename("path/to/current/file.foo", "path/to/new/desination/for/file.foo")
See this SO post
So:
source_file = os.path.join(inDir, xmlfile)
if(conditionTrue):
    dest_file = os.path.join(outDir, 'Pass', xmlfile)
    print("Pass: ", xmlfile)
else:
    dest_file = os.path.join(outDir, 'Fail', xmlfile)
    print("Fail: ", xmlfile)
# os.rename does not create missing directories, so make sure the
# Pass/Fail sub-directory exists before moving the file into it.
dest_dir = os.path.dirname(dest_file)
if not os.path.isdir(dest_dir):
    os.makedirs(dest_dir)
os.rename(source_file, dest_file)
Create directories exactly once:
import errno
import os

labels = 'Fail', 'Pass'
dirs = [os.path.join(out_dir, label) for label in labels]
for d in dirs:
    try:
        os.makedirs(d)
    except OSError as e:
        # Only "directory already exists" is expected here; anything else
        # (permissions, a regular file in the way, ...) must surface.
        if e.errno != errno.EEXIST or not os.path.isdir(d):
            raise
Then you could move files into the created directories:
import shutil
print("%s: %s" % (labels[condition_true], xmlfile))
shutil.move(os.path.join(out_dir, xmlfile), dirs[condition_true])
The code exploits that False == 0 and True == 1 in Python.

Finding md5 of files recursively in directory in python

I want to find the md5sum of files starting with "10" ( could be exe, doc, pdf etc) hence not checking the file extension but only the start two digits. So far I've a script to traverse through the directory and print out all such files but couldn't get the checksum to be printed for each of them:
def print_files(file_directory, file_extensions=['10']):
    ''' Print the path of each matching file under file_directory, recursively.

    A file matches when its name starts with one of the prefixes in
    file_extensions and is exactly 19 characters long. Each match also
    increments print_files.counter, which the caller must initialise
    before the first call.
    '''
    # Get the absolute path of the file_directory parameter
    file_directory = os.path.abspath(file_directory)
    # Get a list of files in file_directory
    file_directory_files = os.listdir(file_directory)
    # Traverse through all files
    for filename in file_directory_files:
        filepath = os.path.join(file_directory, filename)
        # Check if it's a normal file or directory
        if os.path.isfile(filepath):
            # Despite its name, file_extension is used as a filename prefix
            for file_extension in file_extensions:
                # Not a reqd file, ignore
                #if not filepath.endswith(file_extension):
                if not filename.startswith(file_extension) or len(filename) != 19:
                    continue
                # We have got a '10' file!
                print_files.counter += 1
                ## TRYING TO READ AND PRINT MD5 USING HASHLIB/ DOESNT WORK###
                hasher = hashlib.md5()
                # NOTE(review): this opens the bare filename rather than
                # filepath, so it only resolves when the file is in the
                # current working directory -- likely part of the problem.
                with open(filename, 'rb') as afile:
                    buf = afile.read(65536)
                    while len(buf) > 0:
                        hasher.update(buf)
                        buf = afile.read(65536)
                # Print it's name
                print('{0}'.format(filepath))
                # NOTE(review): hasher is the md5 object created above and is
                # being called like a function here; the answers below print
                # hasher.hexdigest() instead.
                print hasher('{0}.format(filepath)').hexdigest()
                print '\n'
        elif os.path.isdir(filepath):
            # We got a directory, enter into it for further processing
            # (file_extensions is not forwarded, so the default is used)
            print_files(filepath)
if __name__ == '__main__':
    if len(sys.argv) != 2:
        # No single directory argument: fall back to the working directory
        file_directory = os.getcwd()
    elif os.path.isdir(sys.argv[1]):
        file_directory = sys.argv[1]
    else:
        print('ERROR: "{0}" is not a directory.'.format(sys.argv[1]))
        exit(1)
    print('\n -- Looking for Required Files in "{0}" -- \n'.format(file_directory))
    # Reset the match counter, then scan the tree
    print_files.counter = 0
    print_files(file_directory)
'
I'd recommend that you do not solve this recursively, but instead make use of os.walk() to traverse the directory structure. The following code could be the body of your print_files function.
file_directory = os.path.abspath(file_directory)
paths_to_hash = []
# os.walk yields, for every directory, the names of the files directly inside
# it, so joining root and filename gives the full path.
for root, dirs, filenames in os.walk(file_directory, topdown=False):
    for filename in filenames:
        if filename[:2] == '10':
            paths_to_hash.append(os.path.join(root, filename))
for path in paths_to_hash:
    with open(path, 'rb') as fh:
        # hexdigest() gives the printable form; digest() is raw bytes
        file_hash = hashlib.md5(fh.read()).hexdigest()
    print('hash: {0} for path: {1}'.format(file_hash, path))
The line printing the hasher should be:
print('{0}'.format(hasher.hexdigest()))
Got it fixed with this line
print hashlib.md5(open('{0}'.format(filepath)).read()).hexdigest()
I wasn't reading the file; I was just passing the path string to hashlib.md5. Thanks, Matt, for the insight.

Copy Files From List Python

Im making a small python program to copy some files. My filenames are in a list "selectedList".
The user has selected the source dir "self.DirFilename" and the destination dir "self.DirDest".
I'm using cp instead of shutil because I've read that shutil is slow.
Heres my code:
# Copy <DirFilename>/<name>.mov into DirDest for every selected name.
for i in selectedList:
    src_dir = self.DirFilename + "/" + str(i) + ".mov"
    dst_dir = self.DirDest
    # os.system returns a non-zero status when the command fails
    r = os.system('cp -fr %s %s' % (src_dir, dst_dir))
    if r != 0:
        print('An error occurred!')
I would like the copy to search the source directory for the given filename and then recreate the folder structure in the destination as well as copy the file.
Any suggestions would be helpful (like any massively obvious mistakes that i'm making)- its my first python programme and I'm nearly there!
Thanks
Gavin
I think something like this could do the trick. Of course, you may want to use something more advanced than os.system to call cp.
import os
import subprocess

# The question builds filenames with str(i), so compare against the string
# form of every selected entry; a set makes each lookup O(1).
selected_names = set(str(i) for i in selectedList)

for root, _dirs, files in os.walk(self.DirFilename):
    for name in files:
        f_name, f_ext = os.path.splitext(name)
        if f_ext != ".mov" or f_name not in selected_names:
            continue
        src_abs_path = os.path.join(root, name)
        src_relative_path = os.path.relpath(src_abs_path, self.DirFilename)
        # Recreate the source folder structure below the destination
        dst_abs_path = os.path.join(self.DirDest, src_relative_path)
        dst_dir = os.path.dirname(dst_abs_path)
        if not os.path.exists(dst_dir):
            os.makedirs(dst_dir)
        # An argument list (no shell) keeps paths containing spaces or shell
        # metacharacters intact.
        ret = subprocess.call(['cp', '-fr', src_abs_path, dst_abs_path])
        if ret != 0:
            print('An error occurred!')
See http://blogs.blumetech.com/blumetechs-tech-blog/2011/05/faster-python-file-copy.html for a pure Python implementation of the recursive copy.
You can use os.walk to find the file you need:
def find_files(...):
for ... in os.walk(...):
if ...:
yield filename
for name in find_files(...):
copy(name, ...)
import glob
import os
import shutil

for fname in selectedList:
    filename = str(fname) + '.mov'
    # With recursive=True (Python 3.5+) '**' matches zero or more directory
    # levels, so one pattern covers the top directory and all sub-directories.
    # glob.escape keeps special characters in the names literal.
    pattern = os.path.join(glob.escape(self.DirFilename), '**',
                           glob.escape(filename))
    for src_path in glob.glob(pattern, recursive=True):
        # Mirror the source-relative location below the destination
        dst_path = os.path.join(self.DirDest,
                                os.path.relpath(src_path, self.DirFilename))
        dst_dir = os.path.dirname(dst_path)
        if not os.path.isdir(dst_dir):
            os.makedirs(dst_dir)
        # Copy the file however you like; shutil.copy2 is one option
        shutil.copy2(src_path, dst_path)

Categories