"Delete files based on Invoice number, date and verison number" - python

I want to delete multiple files from certain directory based on file name, date and version number using Python. PS File creation date cant be taken into account
Refereed post of Stackoverflow but file names and version numbers are changed. How to keep date and version number to find latest file.
source = r'C:\Users\XMLFiles'
file_names = os.listdir(source)
latest_files = {}
for file_name in file_names:
name_parts = file_name.split("_")
date_stamp = name_parts[2], name_parts[3].split(".")[0]
if date_stamp not in latest_files or file_name > latest_files[date_stamp]:
latest_files[date_stamp] = file_name
print(latest_files)
keep_files = latest_files.values()
for file_name in file_names:
if file_name in keep_files:
continue
os.remove(os.path.join(source, file_name)
##################
List of files to process
=============================
Invoice_456879_20180404_2510.xml
Invoice_123876_20171027_17.xml
Invoice_123876_20180404_2513.xml
Invoice_832765_20170309_2.xml
Invoice_832765_20170313_0.xml
Invoice_832765_20170323_5.xml
Invoice_832765_20170330_2.xml
Invoice_832765_20170613_3.xml
Invoice_832765_20171206_18.xml
Invoice_832765_20171206_30.xml
Invoice_832765_20171206_36.xml
Invoice_832765_20180404_3066.xml
Invoice_832765_20180405_9770.xml
Invoice_832765_20180405_9779.xml
Invoice_698325_20170308_0.xml
Invoice_698325_20170309_3.xml
Invoice_698325_20170323_4.xml
Invoice_698325_20170330_5.xml
Invoice_698325_20170613_4.xml
Invoice_698325_20171206_8.xml
Invoice_698325_20171206_24.xml
Invoice_698325_20171206_46.xml
Invoice_698325_20180404_3067.xml
Invoice_698325_20180405_9771.xml
===========================================================
Expected Output
Invoice_456879_20180404_2510.xml
Invoice_123876_20180404_2513.xml
Invoice_832765_20180405_9779.xml
Invoice_698325_20180405_9771.xml

latest_files = {}
for file_name in file_names:
name_parts = file_name.split('_')
invoice = name_parts[1]
ymd = name_parts[2]
version = int(name_parts[3].split('.')[0])
if invoice not in latest_files:
latest_files[invoice] = (ymd, version, file_name)
continue
latest_ymd, latest_version = latest_files[invoice][:2]
if ymd > latest_ymd or (ymd == latest_ymd and version > latest_version):
latest_files[invoice] = (ymd, version, file_name)
keep_files = [tup[2] for tup in latest_files.values()]
for file_name in file_names:
if file_name in keep_files:
continue
os.remove(os.path.join(source, file_name))

Related

How to copy only non-duplicate files whilst maintaining folder structure?

I am trying to find duplicates between two folders and copy only unique image files to the 'dest' folder. I can copy all the non-dupes using the code below, however it doesn't maintain the source directory structure. I think OS.walk returns 3 tuples, but they aren't linked so not sure how to re-construct the sub dir?
Example:
import shutil, os
from difPy import dif
source = input('Input source folder:')
dest = input('Input backup \ destination folder:')
ext = ('.jpg','.jpeg','.gif','.JPG','.JPEG','.GIF')
search = dif(source, dest)
result = search.result
result
dupes = []
srcfiles = []
filecount = []
failed = []
removed = []
for i in result.values():
dupes.append(i['location'])
for dirpath, subdirs, files in os.walk(source):
for x in files:
if x.endswith(ext):
srcfiles.append(os.path.join(dirpath, x))
for f in srcfiles:
if f not in dupes:
shutil.copy(f, dest)
print('File copied successfully - '+f)
filecount.append(f)
else:
print('File not copied successfully !!!! - '+f)
failed.append(f)
I have also tried using the shutil.copytree function with an ignore list, however it requires a new folder and can't get the ignore list function to work
shutil.copytree example:
for i in result.values():
df = []
df.append(i['filename'])
def ignorelist(source, df):
return [f for f in df if os.path.isfile(os.path.join(source, f))]
shutil.copytree(source, destnew, ignore=ignorelist)
This function ignorelist should do the trick:
import shutil, os
from difPy import dif
source = input('Input source folder:')
dest = input('Input backup \ destination folder:')
ext = ('.jpg','.jpeg','.gif')
search = dif(source, dest)
dupes = [value['location'] for value in search.result.values()]
def ignorelist(source, files):
return [file for file in files
if (os.path.isfile(os.path.join(source, file))
and (os.path.join(source, file) in dupes
or not file.lower().endswith(ext)))]
shutil.copytree(source, dest, ignore=ignorelist)
And the other "more manual" way would be
import shutil, os
from difPy import dif
source = input('Input source folder:').rstrip('/\\')
dest = input('Input backup \ destination folder:').rstrip('/\\')
ext = ('.jpg','.jpeg','.gif')
search = dif(source, dest)
dupes = [value['location'] for value in search.result.values()]
srcfiles = []
copied = []
failed = []
skipped = []
for dirpath, subdirs, files in os.walk(source):
for file in files:
if file.lower().endswith(ext):
srcfile = os.path.join(dirpath,file)
srcfiles.append(srcfile)
if srcfile in dupes:
print('File not copied (duplicate) - '+srcfile)
skipped.append(srcfile)
else:
try:
destfile = os.path.join(dest,srcfile[len(source)+1:])
os.makedirs(os.path.dirname(destfile), exist_ok=True)
shutil.copy(srcfile,destfile)
print('File copied successfully - '+srcfile)
copied.append(srcfile)
except Exception as err:
print('File not copied (error %s) - %s' % (str(err),srcfile))
failed.append(f)
I have changed some variable names to make them more descriptive. And what you call failed is really just a list of files that are not copied because they are duplicates rather than files whose copying was attempted but failed.
import shutil, os
from difPy import dif
source = input('Input source folder: ')
dest = input('Input backup \ destination folder: ')
# Remove trailing path separators if they exist:
if source.endswith(('/', '\\')):
source = source[:-1]
if dest.endswith(('/', '\\')):
dest = dest[:-1]
# Use the correct path separator to
# ensure correct matching with dif results:
if os.sep == '/':
source = source.replace('\\', os.sep)
elif os.sep == '\\':
source = source.replace('/', os.sep)
source_directory_length = len(source) + 1
ext = ('.jpg','.jpeg','.gif','.JPG','.JPEG','.GIF')
search = dif(source, dest)
result = search.result
# Set comprehension:
dupes = {duplicate['location'] for duplicate in result.values()}
copied = []
not_copied = []
for dirpath, subdirs, files in os.walk(source):
for file in files:
if file.endswith(ext):
source_path = os.path.join(dirpath, file)
if source_path not in dupes:
# get subdirectory of source directory that this file is in:
file_length = len(file) + 1
# Get subdirectory relative to the source directory:
subdirectory = source_path[source_directory_length:-file_length]
if subdirectory:
dest_directory = os.path.join(dest, subdirectory)
# ensure directory exists:
os.makedirs(dest_directory, exist_ok=True)
else:
dest_directory = dest
dest_path = os.path.join(dest_directory, file)
shutil.copy(source_path, dest_path)
print('File copied successfully -', source_path)
copied.append(source_path)
else:
print('File not copied -', source_path)
not_copied.append(source_path)

How to prevent shutil.move from overwriting a file if it already exists?

I'm using this Python code in Windows:
shutil.move(documents_dir + "\\" + file_name, documents_dir + "\\backup\\"
+ subdir_name + "\\" + file_name)
When this code is called more times, it overwrites the destination file. I would like to move the file
and if the destination already exists, to rename it
e.g. file_name = foo.pdf
and in backup folder will be foo.pdf, foo(1).pdf, foo(2).pdf etc. or similarly e.g. with dashes
foo-1.pdf, foo-2.pdf etc.
You could just check with os.path.exists() as you're going.
import os
import shutil
file_name = 'test.csv'
documents_dir = r'C:\BR\Test'
subdir_name = 'test'
# using os.path.join() makes your code easier to port to another OS
source = os.path.join(documents_dir, file_name)
dest = os.path.join(documents_dir, 'backup', subdir_name, file_name)
num = 0
# loop until we find a file that doesn't exist
while os.path.exists(dest):
num += 1
# use rfind to find your file extension if there is one
period = file_name.rfind('.')
# this ensures that it will work with files without extensions
if period == -1:
period = len(file_name)
# create our new destination
# we could extract the number and increment it
# but this allows us to fill in the gaps if there are any
# it has the added benefit of avoiding errors
# in file names like this "test(sometext).pdf"
new_file = f'{file_name[:period]}({num}){file_name[period:]}'
dest = os.path.join(documents_dir, 'backup', subdir_name, new_file)
shutil.move(source, dest)
Or since this is probably used in a loop you could just drop it into a function.
import os
import shutil
def get_next_file(file_name, dest_dir):
dest = os.path.join(dest_dir, file_name)
num = 0
while os.path.exists(dest):
num += 1
period = file_name.rfind('.')
if period == -1:
period = len(file_name)
new_file = f'{file_name[:period]}({num}){file_name[period:]}'
dest = os.path.join(dest_dir, new_file)
return dest
file_name = 'test.csv'
documents_dir = r'C:\BR\Test'
subdir_name = 'test'
source = os.path.join(documents_dir, file_name)
dest = get_next_file(file_name, os.path.join(documents_dir, 'backup', subdir_name))
shutil.move(source, dest)

Create folders based on filenames

I have a folder with some 1500 excel files . The format of each file is something like this:
0d20170101abcd.xlsx
1d20170101ef.xlsx
0d20170104g.xlsx
0d20170109hijkl.xlsx
1d20170109mno.xlsx
0d20170110pqr.xlsx
The first character of the file name is either '0' or '1' followed by 'd' followed by the date when the file was created followed by customer id(abcd,ef,g,hijkl,mno,pqr).The customer id has no fixed length and it can vary.
I want to create folders for each unique date(folder name should be date) and move the files with the same date into a single folder .
So for the above example , 4 folders (20170101,20170104,20170109,20170110) has to be created with files with same dates copied into their respective folders.
I want to know if there is any way to do this in python ? Sorry for not posting any sample code because I have no idea as to how to start.
Try this out:
import os
import re
root_path = 'test'
def main():
# Keep track of directories already created
created_dirs = []
# Go through all stuff in the directory
file_names = os.listdir(root_path)
for file_name in file_names:
process_file(file_name, created_dirs)
def process_file(file_name, created_dirs):
file_path = os.path.join(root_path, file_name)
# Check if it's not itself a directory - safe guard
if os.path.isfile(file_path):
file_date, user_id, file_ext = get_file_info(file_name)
# Check we could parse the infos of the file
if file_date is not None \
and user_id is not None \
and file_ext is not None:
# Make sure we haven't already created the directory
if file_date not in created_dirs:
create_dir(file_date)
created_dirs.append(file_date)
# Move the file and rename it
os.rename(
file_path,
os.path.join(root_path, file_date, '{}.{}'.format(user_id, file_ext)))
print file_date, user_id
def create_dir(dir_name):
dir_path = os.path.join(root_path, dir_name)
if not os.path.exists(dir_path) or not os.path.isdir(dir_path):
os.mkdir(dir_path)
def get_file_info(file_name):
match = re.search(r'[01]d(\d{8})([\w+-]+)\.(\w+)', file_name)
if match:
return match.group(1), match.group(2), match.group(3)
return None, None, None
if __name__ == '__main__':
main()
Note that depending on the names of your files, you might want to change (in the future) the regex I use, i.e. [01]d(\d{8})([\w+-]+) (you can play with it and see details about how to read it here)...
Check this code.
import os
files = list(x for x in os.listdir('.') if x.is_file())
for i in files:
d = i[2:10] #get data from filename
n = i[10:] #get new filename
if os.path.isdir(i[2:10]):
os.rename(os.getcwd()+i,os.getcwd()+d+"/"+i)
else:
os.mkdir(os.getcwd()+i)
os.rename(os.getcwd()+i,os.getcwd()+d+"/"+i)
Here's is the repl link
Try this out :
import os, shutil
filepath = "your_file_path"
files = list(x for x in os.listdir(filepath) if x.endswith(".xlsx"))
dates = list(set(x[2:10] for x in files))
for j in dates:
os.makedirs(filepath + j)
for i in files:
cid = i[10:]
for j in dates:
if j in i:
os.rename(filepath+i,cid)
shutil.copy2(filepath+cid, filepath+j)

Put all files with same name in a folder

I'm new to Python and I need a program that copies files from the same day into a new folder.
Example files:
20120807_first_day_pic.jpg
20120807_first_day_sheet.jpg
20120807_first_day_sheet2.jpg
20120907_second_day_pic.jpg
20120907_second_day_sheet.jpg
20120907_second_day_sheet2.jpg
This is what I have so far, but every file gets a folder and not the whole day.
import os, re, shutil
tfolder = 'D:/Testing/src/'
os.chdir(tfolder)
re_year19xxxxxx = re.compile('(19[0-9][0-9][0-9][0-9])')
re_year20xxxxxx = re.compile('(20[0-9][0-9][0-9][0-9])')
re_ed = re.compile('(ED[0-9])')
destPath = 'D:/Testing/Dest/'
def analyse_file_name(fname):
filePath, coords = os.path.split(fname) #the new folders will be named according to the first 4 characters of the original file name
coordsFolder = coords[:53]
coordsFname = coords[:53]
coordsExt = os.path.splitext(fname)
year = 'year' #create variable year
ed = 'ed' #create variable ed to store the edition number if necessary
bname = fname #the original file name
for re_year in (re_year19xxxxxx, re_year20xxxxxx):
rx = re_year.search(fname) #search for regex in the file name and store it in rx
if rx:
year = rx.group(1) #if the regex is found, store the year
bname.replace(year, ' ')
res = re_ed.search(fname)
if res:
ed = res.group(1)
bname.replace(ed, ' ')
os.chdir(destPath)
if year is 'year':
fname2 = os.path.join(destPath, coordsFolder) + '\\' + coordsFname + coordsExt[1]
else:
fname2 = os.path.join(destPath, coordsFolder,year,ed) + '\\' + coordsFname + coordsExt[1]
print('%s -> %s' % (fname, fname2)) #debug print
dirn, _ = os.path.split(fname2)
if not os.path.exists(dirn):
os.makedirs(dirn)
shutil.copy(fname, fname2)
for root, dirs, files in os.walk(tfolder):
for name in files:
fn = os.path.join(root, name)
analyse_file_name(fn)
If you just want to copy files that start with a known date string format, how about something like this?
def copyfile(filepath, target_dir):
p, filename = os.path.split(filepath)
# get date component of name
date_component = filename.split("_", 1)[0]
# try to parse out the date
try:
d = datetime.datetime.strptime(date_component, "%Y%m%d")
except ValueError:
print "Could not place: ", filename
return
target_date_dir = os.path.join(target_dir, str(d.year), str(d.month), str(d.day))
os.makedirs(target_date_dir)
shutil.copy(filepath, target_date_dir)
First, create a dict (a defaultdict was even more convenient here) that will gather the files for a date (it's good to use re, but given the names of your files using split was easier):
>>> import os
>>> import re
>>> pat = r'(\d+)(?:_\d+)?_(\w+?)[\._].*'
>>> from collections import defaultdict
>>> dict_date = defaultdict(lambda : defaultdict(list))
>>> for fil in os.listdir(path):
if os.path.isfile(os.path.join(path, fil)):
date, animal = re.match(pat, fil).groups()
dict_date[date][animal].append(fil)
>>> dict_date['20120807']
defaultdict(<type 'list'>, {'first': ['20120807_first_day_pic.jpg', '20120807_first_day_sheet.jpg', '20120807_first_day_sheet2.jpg']})
Then for each date, create a subfolder and copy the corresponding files there:
>>> from shutil import copyfile
>>> for date in dict_date:
for animal in dict_date[date]:
try:
os.makedirs(os.path.join(path, date, animal))
except os.error:
pass
for fil in dict_date[date][animal]:
copyfile(os.path.join(path, fil), os.path.join(path, date, animal, fil))
EDIT: took into account OP's new requirements, and Khalid's remark.
Regex day :)
What about trying to match the filename with
pattern=r'(?P<filedate>(?P<year>\d{4})(?P<month>\d{2})(?P<day>\d{2}))\_(?P<bloodyrestofname>.*)'
Complete date, year, etc. may be retrieved from the respective named groups in the match.
import os, shutil
src_path = "D:\\Testing\\Src\\"
dest_path = "D:\\Testing\\Dest\\"
for file in os.listdir(src_path):
if not os.path.isdir(dest_path + file.split("-")[0]):
os.mkdir(dest_path + file.split("-")[0])
shutil.copy(src_path + file, dest_path + file.split("-")[0])

Write file to directory based on variable in Python

The script will generate multiple files using the year and id variable. These files need to be placed into a folder matching year and id. How do I write them to the correct folders?
file_root_name = row["file_root_name"]
year = row["year"]
id = row["id"]
path = year+'-'+id
try:
os.makedirs(path)
except:
pass
output = open(row['file_root_name']+'.smil', 'w')
output.write(prettify(doctype, root))
If I understand your question correctly, you want to do this:
import os.path
file_name = row['file_root_name']+'.smil'
full_path = os.path.join(path, file_name)
output = open(full_path, 'w')
Please note that it's not very common in Python to use the + operator for string concatenation. Although not in your case, with large strings the method is not very fast.
I'd prefer:
file_name = '%s.smil' % row['file_root_name']
and:
path = '%i-%i' % (year, id)

Categories