How to copy only non-duplicate files whilst maintaining folder structure? - python

I am trying to find duplicates between two folders and copy only the unique image files to the 'dest' folder. I can copy all the non-dupes using the code below, but it doesn't maintain the source directory structure. I think os.walk yields 3-tuples (dirpath, dirnames, filenames), but I'm not sure how to use them to reconstruct the sub-directory for each file.
Example:
# Question code: copies every non-duplicate image found below `source`
# into `dest`. The sub-directory layout of `source` is not reproduced.
import shutil, os
from difPy import dif
source = input('Input source folder:')
dest = input('Input backup \ destination folder:')
# str.endswith accepts a tuple, so a single call tests every extension.
ext = ('.jpg','.jpeg','.gif','.JPG','.JPEG','.GIF')
search = dif(source, dest)
result = search.result
result
dupes = []
srcfiles = []
filecount = []
failed = []
removed = []
# Collect the 'location' value of every entry in the difPy result.
for i in result.values():
    dupes.append(i['location'])
# Gather the full path of every image file below `source`.
for dirpath, subdirs, files in os.walk(source):
    for x in files:
        if x.endswith(ext):
            srcfiles.append(os.path.join(dirpath, x))
for f in srcfiles:
    if f not in dupes:
        # The target is always `dest` itself, never a sub-directory of it;
        # this is where the source folder structure is lost.
        shutil.copy(f, dest)
        print('File copied successfully - '+f)
        filecount.append(f)
    else:
        # Reached for files listed in `dupes`, i.e. no copy was attempted.
        print('File not copied successfully !!!! - '+f)
        failed.append(f)
I have also tried using the shutil.copytree function with an ignore list; however, it requires a new folder and I can't get the ignore list function to work.
shutil.copytree example:
# Question code (attempt with shutil.copytree). `result`, `source` and
# `destnew` are defined outside this snippet.
for i in result.values():
    # NOTE(review): indentation was lost in this listing. As the statements
    # are ordered, df is reset on every iteration, so only the last filename
    # would survive - confirm against the asker's real code.
    df = []
    df.append(i['filename'])
# NOTE(review): the parameter named df shadows the df list built above, so
# the list collected in the loop is never consulted inside this function.
def ignorelist(source, df):
    return [f for f in df if os.path.isfile(os.path.join(source, f))]
shutil.copytree(source, destnew, ignore=ignorelist)

This function ignorelist should do the trick:
import shutil, os
from difPy import dif

source = input('Input source folder:')
# The backslash is escaped so the prompt text is unchanged but no longer
# relies on an invalid escape sequence.
dest = input('Input backup \\ destination folder:')
ext = ('.jpg', '.jpeg', '.gif')
search = dif(source, dest)
# A set gives constant-time membership tests inside ignorelist.
dupes = {value['location'] for value in search.result.values()}


def ignorelist(source, files):
    """Return the names in *files* that shutil.copytree must skip.

    copytree calls this once per visited directory with the directory path
    (*source*) and the names it contains (*files*). A regular file is
    skipped when its full path is listed in ``dupes`` or when its name does
    not end with one of the image extensions (compared case-insensitively).
    Anything that is not a regular file is never skipped, so copytree keeps
    descending into sub-directories.
    """
    ignored = []
    for file in files:
        path = os.path.join(source, file)
        if not os.path.isfile(path):
            continue
        if path in dupes or not file.lower().endswith(ext):
            ignored.append(file)
    return ignored


# dest has just been scanned by dif, so it normally exists already; without
# dirs_exist_ok=True (Python 3.8+) copytree raises FileExistsError for an
# existing destination instead of merging into it.
shutil.copytree(source, dest, ignore=ignorelist, dirs_exist_ok=True)
And the other "more manual" way would be
import shutil, os
from difPy import dif

# Trailing separators are removed so the paths have a predictable form.
source = input('Input source folder:').rstrip('/\\')
dest = input('Input backup \\ destination folder:').rstrip('/\\')
ext = ('.jpg', '.jpeg', '.gif')
search = dif(source, dest)
# A set gives constant-time membership tests in the loop below.
dupes = {value['location'] for value in search.result.values()}
srcfiles = []   # every image file found below source
copied = []     # files copied to dest
failed = []     # files whose copy was attempted but raised an error
skipped = []    # files not copied because they are listed in dupes
for dirpath, subdirs, files in os.walk(source):
    for file in files:
        if not file.lower().endswith(ext):
            continue
        srcfile = os.path.join(dirpath, file)
        srcfiles.append(srcfile)
        if srcfile in dupes:
            print('File not copied (duplicate) - ' + srcfile)
            skipped.append(srcfile)
            continue
        # Re-create the file's position relative to source below dest.
        destfile = os.path.join(dest, os.path.relpath(srcfile, source))
        try:
            os.makedirs(os.path.dirname(destfile), exist_ok=True)
            shutil.copy(srcfile, destfile)
        except OSError as err:
            print('File not copied (error %s) - %s' % (str(err), srcfile))
            # Record the file that failed (the original appended an
            # undefined name here, raising NameError inside the handler).
            failed.append(srcfile)
        else:
            print('File copied successfully - ' + srcfile)
            copied.append(srcfile)

I have changed some variable names to make them more descriptive. And what you call failed is really just a list of files that are not copied because they are duplicates rather than files whose copying was attempted but failed.
import shutil, os
from difPy import dif

source = input('Input source folder: ')
dest = input('Input backup \\ destination folder: ')
# Use the platform's path separator and drop every trailing separator.
# NOTE(review): the original comment says this is needed so that the paths
# match the locations reported by dif - confirm against difPy's output.
alt_sep = '\\' if os.sep == '/' else '/'
source = source.replace(alt_sep, os.sep).rstrip(os.sep)
dest = dest.replace(alt_sep, os.sep).rstrip(os.sep)
# Lower-case only: file names are lower-cased before the comparison, so
# mixed-case extensions such as '.Jpg' are accepted too.
ext = ('.jpg', '.jpeg', '.gif')
search = dif(source, dest)
result = search.result
# Set comprehension:
dupes = {duplicate['location'] for duplicate in result.values()}
copied = []       # files copied to dest
not_copied = []   # files skipped because they are listed in dupes
for dirpath, subdirs, files in os.walk(source):
    # Directory below dest that mirrors dirpath's position below source.
    dest_directory = os.path.normpath(
        os.path.join(dest, os.path.relpath(dirpath, source)))
    for file in files:
        if not file.lower().endswith(ext):
            continue
        source_path = os.path.join(dirpath, file)
        if source_path in dupes:
            print('File not copied -', source_path)
            not_copied.append(source_path)
            continue
        # Created lazily so directories without images are not mirrored.
        os.makedirs(dest_directory, exist_ok=True)
        dest_path = os.path.join(dest_directory, file)
        shutil.copy(source_path, dest_path)
        print('File copied successfully -', source_path)
        copied.append(source_path)

Related

how to merge folders in python?

How to remove part of a tree but keep the files and directories in python?
I have paths like this:
r"C:\User\Desktop\g1sr56g41f2d3s1gf\Document\A\file1.txt"
r"C:\User\Desktop\g1sr56g41f2d3s1gf\Document\B\C\file2.txt"
r"C:\User\Desktop\g1sr56g41f2d3s1gf\file3.txt"
r"C:\User\Desktop\F2F31DS5FDSF1S2F3DS2F1D23\file4.txt"
r"C:\User\Desktop\g1sr56g41f2d3s1gf\Document\B\C\file5.txt"
r"C:\User\Desktop\g1sr56g41f2d3s1gf\Document\D\E\file6.txt"
I want to move them to:
r"C:\User\Desktop\Document\A\file1.txt"
r"C:\User\Desktop\Document\B\C\file2.txt"
r"C:\User\Desktop\file3.txt"
r"C:\User\Desktop\file4.txt"
r"C:\User\Desktop\Document\B\C\file5.txt"
r"C:\User\Desktop\Document\D\E\file6.txt"
A quick and dirty way to do it:
import os
from pathlib import PureWindowsPath

paths = r"C:\User\Desktop\g1sr56g41f2d3s1gf\Document\D\E\file6.txt"
# Parse with Windows path rules on every host; splitting on os.sep only
# works when the script itself runs on Windows.
parts = PureWindowsPath(paths).parts
# parts[0] is the anchor 'C:\', so index 3 is the folder directly below
# Desktop - the component to remove.
trim = parts[3]
print(trim)
# Drop exactly that one component. A textual replace of "/" + trim would
# also remove any later component that merely starts with the same text.
final_path = str(PureWindowsPath(*parts[:3], *parts[4:]))
print(final_path)
output
C:\User\Desktop\Document\D\E\file6.txt
Solution 2
import os
import re
import ntpath

paths = r"C:\User\Desktop\g1sr56g41f2d3s1gf\Document\D\E\file6.txt"
# ntpath applies Windows separator rules on any host, so the split does not
# depend on the os.sep of the machine running the script.
parts = ntpath.normpath(paths).split(ntpath.sep)
# parts[0] is the drive 'C:', so index 3 is the folder below Desktop.
del parts[3]
final_path = ntpath.sep.join(parts)
print(final_path)
output
C:\User\Desktop\Document\D\E\file6.txt
Here is my code:
#!/usr/bin/python3
import os, shutil

DST = 'Desktop'
# Snapshot of the top-level entries; folders created while moving files are
# therefore not treated as wrapper folders themselves.
for folder_name in os.listdir(DST):
    folder = os.path.join(DST, folder_name)
    if not os.path.isdir(folder):
        continue
    for path, _, files in os.walk(folder):
        # Same sub-path, but rooted at DST instead of the wrapper folder.
        relpath = os.path.join(DST, os.path.relpath(path, folder))
        if files:
            # makedirs also creates missing parents; a plain mkdir fails
            # when an intermediate directory contained no files of its own.
            os.makedirs(relpath, exist_ok=True)
        for file in files:
            search = os.path.join(path, file)
            destination = os.path.join(relpath, file)
            os.replace(search, destination)
    # Remove the emptied wrapper straight away: deferring the removal to the
    # end could delete files that a later wrapper moved into a directory
    # with the same name as this one.
    shutil.rmtree(folder)

os.rename(source, destination) The filename, directory name, or volume label syntax is incorrect

# Question code: tries to rename every file in `folder` using the names read
# from Names.txt. The error it produces is quoted below the listing.
import os
f = open("Names.txt", "r")
# readlines() returns one string per line of the file, each keeping its
# trailing newline character.
names = f.readlines()
folder = r'C:\Users\e007l\Desktop\Rename\\'
# Indexing into names starts at 1, so names[0] is never used.
count = 1
for file_name in os.listdir(folder):
    # Paths are built by plain string concatenation.
    source = folder + file_name
    destination = folder + names[int(count)] + ".txt"
    os.rename(source, destination)
    count += 1
res = os.listdir(folder)
print(res)
print(folder)
This should change the names of the files in the folder to the names in my list, but it doesn't; it simply gives me this error message:
[WinError 123] The filename, directory name, or volume label syntax is incorrect: 'C:\Path\names1.txt' -> 'C:\Path\Beta\n.txt'
My text file Names.txt has this inside:
Alpha Beta Delta Omega
Those are the names I want to Give to the existing files
There are 3 errors:
The names are not loaded correctly;
The paths of the files are not joined correctly;
You start your count at 1, so the first name is skipped and the index can run past the end of the list (IndexError).
Here is a code that seems to work in the way you want.
import os

# The with-block closes the file. split() without an argument splits on any
# whitespace, so the trailing newline never ends up in the last name.
with open("Names.txt", "r") as f:
    names = f.read().split()
print(f"{names = }")
folder = r'.\rename'
# os.listdir returns entries in arbitrary order, so sort for a repeatable
# pairing. zip stops at the shorter sequence, so surplus files are left
# untouched instead of raising IndexError.
for file_name, name in zip(sorted(os.listdir(folder)), names):
    source = os.path.join(folder, file_name)
    destination = os.path.join(folder, name) + ".txt"
    os.rename(source, destination)
res = os.listdir(folder)
print(res)
print(folder)
Output:
names = ['Alpha', 'Beta', 'Delta', 'Omega']
['Alpha.txt', 'Beta.txt', 'Delta.txt']
.\rename

How to prevent shutil.move from overwriting a file if it already exists?

I'm using this Python code in Windows:
shutil.move(documents_dir + "\\" + file_name, documents_dir + "\\backup\\"
+ subdir_name + "\\" + file_name)
When this code is called more times, it overwrites the destination file. I would like to move the file
and if the destination already exists, to rename it
e.g. file_name = foo.pdf
and in backup folder will be foo.pdf, foo(1).pdf, foo(2).pdf etc. or similarly e.g. with dashes
foo-1.pdf, foo-2.pdf etc.
You could just check with os.path.exists() as you're going.
import os
import shutil

file_name = 'test.csv'
documents_dir = r'C:\BR\Test'
subdir_name = 'test'
# using os.path.join() makes your code easier to port to another OS
source = os.path.join(documents_dir, file_name)
backup_dir = os.path.join(documents_dir, 'backup', subdir_name)
# make sure the backup folder exists before anything is moved into it
os.makedirs(backup_dir, exist_ok=True)
dest = os.path.join(backup_dir, file_name)
# Split once, outside the loop. splitext copes with names that have no
# extension and treats a leading dot ('.env') as part of the name, so the
# counter is never placed in front of a dotfile's name.
stem, extension = os.path.splitext(file_name)
num = 0
# loop until we find a file that doesn't exist;
# counting up from 1 (instead of parsing an existing number) fills any gaps
# and avoids errors with file names like "test(sometext).pdf"
while os.path.exists(dest):
    num += 1
    dest = os.path.join(backup_dir, f'{stem}({num}){extension}')
shutil.move(source, dest)
Or since this is probably used in a loop you could just drop it into a function.
import os
import shutil
def get_next_file(file_name, dest_dir):
    """Return a path inside *dest_dir* for *file_name* that does not exist yet.

    If ``dest_dir/file_name`` is free it is returned unchanged. Otherwise a
    counter is inserted before the extension - ``foo(1).pdf``,
    ``foo(2).pdf``, ... - and the first unused candidate is returned.
    Counting always starts at 1, so gaps left by deleted files are reused.

    Args:
        file_name: Base name of the file (no directory part).
        dest_dir: Directory the file is going to be placed in.

    Returns:
        The chosen destination path as a string. Nothing is created.
    """
    dest = os.path.join(dest_dir, file_name)
    # Split once, outside the loop. splitext copes with names that have no
    # extension and keeps a leading dot ('.env') in the stem, so the counter
    # is never placed in front of a dotfile's name.
    stem, extension = os.path.splitext(file_name)
    num = 0
    while os.path.exists(dest):
        num += 1
        dest = os.path.join(dest_dir, f'{stem}({num}){extension}')
    return dest
# Example usage: move test.csv from documents_dir into its backup sub-folder.
file_name = 'test.csv'
documents_dir = r'C:\BR\Test'
subdir_name = 'test'
source = os.path.join(documents_dir, file_name)
# get_next_file returns a path in the backup folder that does not exist yet,
# so the move below cannot overwrite an earlier backup.
dest = get_next_file(file_name, os.path.join(documents_dir, 'backup', subdir_name))
shutil.move(source, dest)

Autoincrement file names

This issue comes from [here]. I tried asking about this in the linked question, but I was downvoted and told to ask my own question... so here I am.
I tried replicating the results for my own project and it didn't work. when I try to save more than two files the script starts renaming each file instead of just the new ones I create:
file_1_2_2_1_4_4_6_2_2.pdf
file1_3_2_3_3-6_5_1.pdf
file2_1_1_1-7_3_9.pdf
etc
instead of
file_1.pdf
file_2.pdf
file_3.pdf
etc.
Any suggestions?
def save_file():
    """Move every file found below ``path`` into ``newPath`` (question code).

    A file whose name is still free in ``newPath`` keeps its name; otherwise
    ``_<i>`` is inserted before the extension, where ``i`` is a counter
    shared by all files.
    """
    path = "/home/PycharmProjects/untitled/screening/"
    # newPath lies inside path, so os.walk(path) also visits it.
    newPath = "/home/PycharmProjects/untitled/screening/finished"
    i = 1
    for root, dirs, files in os.walk(path):
        for name in files:
            base, extension = os.path.splitext(name)
            if not os.path.exists(os.path.join(newPath, base + extension)):
                oldfile = os.path.join(os.path.abspath(root), name)
                newfile = os.path.join(newPath, base + extension)
                os.rename(oldfile, newfile)
            else:
                oldfile = os.path.join(os.path.abspath(root), name)
                # The suffixed name is not checked for existence before use.
                newfile = os.path.join(newPath, base + '_' + str(i) + extension)
                i += 1
                os.rename(oldfile, newfile)
Thank you in advance for you help!
The reason you get this behavior is that os.walk recurses into subdirs. Your target dir IS a subdir of your sourcedir - so you rename files from source to target and later os.walk into the target directory and rename some more into itself using the "renaming" strategy all the time because the file already exists.
Lengthy solution - most of it creates the file structure, so that this is a Minimal, Complete, and Verifiable example you can use.
See the documentation of the topdown parameter of os.walk.
Create file structure
import os

files = [f"file_{i:05}x.txt" for i in range(20)]
org = os.path.abspath("./dir1/dir2/")
new = os.path.abspath("./dir1/dir2/new/")
# exist_ok lets the example be run again without failing on the directory
os.makedirs(new, exist_ok=True)
# create all in org
# (the file handle gets its own name instead of shadowing the loop variable)
for name in files:
    with open(os.path.join(org, name), "w") as fh:
        fh.write(" ")
# create every 4th one in new
for name in files[::4]:
    with open(os.path.join(new, name), "w") as fh:
        fh.write(" ")
# show the resulting tree; `found` keeps the `files` list above intact
for root, dirs, found in os.walk(org):
    print(root)
    print(" [d] ", dirs)
    print(" [f] ", sorted(found))
Output:
/tmp/dir1/dir2
[d] ['new']
[f] ['file_00000x.txt', 'file_00001x.txt', 'file_00002x.txt', 'file_00003x.txt',
'file_00004x.txt', 'file_00005x.txt', 'file_00006x.txt', 'file_00007x.txt',
'file_00008x.txt', 'file_00009x.txt', 'file_00010x.txt', 'file_00011x.txt',
'file_00012x.txt', 'file_00013x.txt', 'file_00014x.txt', 'file_00015x.txt',
'file_00016x.txt', 'file_00017x.txt', 'file_00018x.txt', 'file_00019x.txt']
/tmp/dir1/dir2/new
[d] []
[f] ['file_00000x.txt', 'file_00004x.txt', 'file_00008x.txt', 'file_00012x.txt',
'file_00016x.txt']
Fixed method
def save_file(old_path, new_path):
    """Move every file below *old_path* into *new_path* without overwriting.

    The target directory itself is never walked, even when it lies inside
    *old_path*, so files that are already there keep their names. A moved
    file whose name is taken in *new_path* gets ``_1``, ``_2``, ... inserted
    before its extension until an unused name is found.

    Args:
        old_path: Directory tree to collect files from.
        new_path: Existing directory that receives the files.

    Raises:
        OSError: If a rename fails, e.g. because *new_path* does not exist.
    """
    new_abs = os.path.abspath(new_path)
    # Pruning needs the default top-down walk and an in-place change of
    # ``dirs``: with topdown=False the sub-directories have already been
    # generated, and rebinding the name (dirs = []) never reaches os.walk.
    for root, dirs, files in os.walk(old_path):
        dirs[:] = [d for d in dirs
                   if os.path.abspath(os.path.join(root, d)) != new_abs]
        root_abs = os.path.abspath(root)
        if root_abs == new_abs:
            # old_path is the target itself: its files are already in place
            continue
        for name in sorted(files):  # sorting is convenience, not needed
            old_file = os.path.join(root_abs, name)
            new_file = os.path.join(new_abs, name)
            # look for a unique name: base_1.ext, base_2.ext, ...
            base, extension = os.path.splitext(name)
            i = 1
            while os.path.exists(new_file):
                new_file = os.path.join(new_abs, f"{base}_{i}{extension}")
                i += 1
            # move the file over
            os.rename(old_file, new_file)
Usage:
# uses the org/new from above
# org = os.path.abspath("./dir1/dir2/")
# new = os.path.abspath("./dir1/dir2/new/")
save_file(org,new)
# print the tree again to show where the files ended up
for root,dirs,files in os.walk(org):
    print(root)
    print(" [d] ", dirs)
    print(" [f] ", sorted(files))
Output afterwards:
/tmp/dir1/dir2
[d] ['new']
[f] []
/tmp/dir1/dir2/new
[d] []
[f] ['file_00000x.txt', 'file_00000x_1.txt', 'file_00001x.txt', 'file_00002x.txt',
'file_00003x.txt', 'file_00004x.txt', 'file_00004x_1.txt', 'file_00005x.txt',
'file_00006x.txt', 'file_00007x.txt', 'file_00008x.txt', 'file_00008x_1.txt',
'file_00009x.txt', 'file_00010x.txt', 'file_00011x.txt', 'file_00012x.txt',
'file_00012x_1.txt', 'file_00013x.txt', 'file_00014x.txt', 'file_00015x.txt',
'file_00016x.txt', 'file_00016x_1.txt', 'file_00017x.txt', 'file_00018x.txt',
'file_00019x.txt']
You can see that some files in new got the _1 infix in their names because a same-named file was already there.

Going into subfolders (python)

I've written something to remove special characters in filenames. But it only handles the one folder and not its subfolders. How can I also do this in subfolders, sub-subfolders and so on?
import os
import re
def dir_list2(directory, *args):
    # Question code (Python 2 print statements). Lists the entries of
    # `directory`, collects the regular files (all of them, or only those
    # whose extension is given in *args), renames files so that characters
    # outside the pattern below become '_', and returns the collected paths.
    # Only `directory` itself is processed - sub-folders are not entered.
    # NOTE(review): indentation was lost in this listing; the nesting of the
    # print/rename section is reconstructed - confirm against the original.
    fileList = []
    content = os.listdir(directory)
    for file in content :
        dirfile = os.path.join(directory, file)
        if os.path.isfile(dirfile):
            if len(args) == 0:
                fileList.append(dirfile)
            else:
                # [1][1:] is the extension without its leading dot.
                if os.path.splitext(dirfile)[1][1:] in args:
                    fileList.append(dirfile)
            print "##################################################"
            print "Old filename:", file
            filename = file
            # NOTE(review): the range A-z also covers the ASCII characters
            # between 'Z' and 'a' ([ \ ] ^ _ `); A-Z was probably intended.
            remove = re.compile("[^.a-zA-z0-9_]")
            output = remove.sub('_', filename)
            # The new path is built from `directory`, the folder being listed.
            newfile = directory + "/" + output
            os.rename(dirfile, newfile)
            print "Corrected filename:", output
            #Removes Special Characters
    return fileList
# Run only when executed as a script; processes the single folder '/path/'.
if __name__ == '__main__':
    fileList = dir_list2('/path/')
Try using os.walk instead of os.listdir, it allows you to walk through a folder and its files and subfolders and so on.
Edit your code to be like:
# os.walk yields one (dirpath, dirnames, filenames) tuple for `directory`
# and for every folder below it.
content = os.walk(directory)
for dirpath, dirnames, filenames in content:
    for file in filenames:
        # dirpath is the folder that actually contains this file.
        dirfile = os.path.join(dirpath, file)
        # The rest of your code
        # (when renaming, build the new path from dirpath rather than from
        # directory, otherwise files in sub-folders are moved to the top)

Categories