BadZipFile: File is not a zip file - python

This is my code. I get this error when I try to execute the script:
Error raise BadZipFile("File is not a zip file")
BadZipFile: File is not a zip file
This is my source directory path:
data_dir = r'L:\DataQA\Python Unzip Files\Source Zipped'
I have multiple zipped folders within the 'Source Zipped' (uncompressed) folder. The same code works when I zip all the subfolders of 'Source Zipped' into a single zipped folder, but I don't want that approach.
import os
import zipfile
import shutil
import json
import logging
import logging.config
import time

def my_start_time():
    global start_time, cumulative_time, start_time_stamp
    start_time = time.time()
    this_time = time.localtime(start_time)
    start_time_stamp = '{:4d}{:02d}{:02d} {:02d}:{:02d}:{:02d}'.format(
        this_time.tm_year, this_time.tm_mon, this_time.tm_mday,
        this_time.tm_hour, this_time.tm_min, this_time.tm_sec)
    cumulative_time = start_time - start_time
    logging.info('Initial Setup: {:s}'.format(start_time_stamp))

def my_time():
    global cumulative_time
    time_taken = time.time() - start_time
    incremental_time = time_taken - cumulative_time
    cumulative_time = time_taken
    logging.info("Started: %s Complete: Cumulative: %.4f s Incremental: %.4f s\n"
                 % (start_time_stamp, cumulative_time, incremental_time))

logging.basicConfig(filename='myunzip_task_log.txt', level=logging.DEBUG)

my_start_time()
logging.info('Initial Setup...')

def write_to_json(data, file):
    value = False
    with open(file, 'w') as f:
        json.dump(json.dumps(data, sort_keys=True), f)
        f.close()
    value = True
    return value

data_dir = r'L:\DataQA\Python Unzip Files\Source Zipped'
temp_dir = r'L:\DataQA\Python Unzip Files\temp1'
new_dir = r'L:\DataQA\Python Unzip Files\temp2'
final_dir = r'L:\DataQA\Python Unzip Files\Destination Unzipped files'

big_list = os.listdir(data_dir)
archive_count = 0
file_count = 152865

basename1 = os.path.join(final_dir, 'GENERIC_ROUGHDRAFT')
basename2 = os.path.join(final_dir, 'XACTDOC')

my_time()

archive_count = len(big_list)
logging.info('Unzipping {} archives...'.format(archive_count))

for folder in big_list:
    prior_count = file_count
    logging.info('Starting: {}'.format(folder))
    try:
        shutil.rmtree(temp_dir)
    except FileNotFoundError:
        pass
    os.mkdir(temp_dir)
    with zipfile.ZipFile(os.path.join(data_dir, folder), mode='r') as a_zip:
        a_zip.extractall(path=temp_dir)
        archive_count += 1
        logging.info('Cumulative total of {} archive(s) unzipped'.format(archive_count))
    bigger_list = os.listdir(temp_dir)
    logging.info('Current archive contains {} subfolders'.format(len(bigger_list)))
    for sub_folder in bigger_list:
        with zipfile.ZipFile(os.path.join(temp_dir, sub_folder), mode='r') as b_zip:
            b_zip.extractall(path=new_dir)
        file1 = "%s (%d).%s" % (basename1, file_count, 'xml')
        file2 = "%s (%d).%s" % (basename2, file_count, 'xml')
        shutil.copy(os.path.join(new_dir, 'GENERIC_ROUGHDRAFT.xml'), file1)
        shutil.copy(os.path.join(new_dir, 'XACTDOC.xml'), file2)
        file_count += 1
    logging.info('{} subfolders unzipped'.format(file_count - prior_count))

#os.remove(data_dir)
shutil.rmtree(data_dir)
os.mkdir(data_dir)
#os.unlink(data_dir)

my_time()

logging.info('Total of {0} files -- {1} pairs -- should be in {2}'.format(2*(file_count-1), file_count-1, final_dir))

time.sleep(1)

my_time()

In both zip archive open statements:
with zipfile.ZipFile(os.path.join(data_dir,folder),mode='r')
and
with zipfile.ZipFile(os.path.join(temp_dir,sub_folder),mode='r')
nothing (at least nothing that we can check) guarantees that the file names you're passing are actually .zip files. It could be a directory, an already extracted file, some file that was already there...
I suggest that you check the file extension prior to extracting, for instance:
import fnmatch

zfn = os.path.join(temp_dir, sub_folder)
if fnmatch.fnmatch(zfn, "*.zip"):
    with zipfile.ZipFile(zfn, mode='r') as whatever:
Some .zip files could be corrupt, but that's less likely. Also, if you wanted to extract .jar and other zip-structured files with a different extension, replace the fnmatch check with:
if zfn.lower().endswith(('.zip','.jar','.docx')):
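Another check you can make is on the file contents rather than the name: zipfile.is_zipfile() reads the magic number and returns False for directories and files that are not zip archives. A minimal sketch of how that guard could look in the outer loop of your script (it reuses the script's existing names; the warning message is just for illustration):
import os
import logging
import zipfile

candidate = os.path.join(data_dir, folder)
if os.path.isfile(candidate) and zipfile.is_zipfile(candidate):
    # only open entries that really are zip archives
    with zipfile.ZipFile(candidate, mode='r') as a_zip:
        a_zip.extractall(path=temp_dir)
else:
    logging.warning('Skipping non-zip entry: %s', candidate)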

Related

Merging converted pdf files in aws lambda function

I'm trying to achieve the functionality below:
Multiple files are uploaded to an S3 bucket.
Non-PDF files need to be converted to PDF and then merged into a single PDF file.
The folder structure is folder1/2/3/4; the files are uploaded under folder 4.
Below is my code (an AWS Lambda function), but the issue is that some of the files are merged before all the files have been converted. Conversion to PDF has to complete successfully before the merging starts.
import os
import io
from io import BytesIO
import tarfile
import boto3
import subprocess
import brotli
from PyPDF2 import PdfMerger
from time import sleep

# Directory where the LibreOffice open source s/w will be saved (Lambda tmp directory)
LIBRE_OFFICE_INSTALL_DIR = '/tmp/instdir'

s3_bucket = boto3.resource("s3").Bucket("bucketname")

def load_libre_office():
    if os.path.exists(LIBRE_OFFICE_INSTALL_DIR) and os.path.isdir(LIBRE_OFFICE_INSTALL_DIR):
        print("Have a cached copy of LibreOffice, Skipping Extraction")
    else:
        print("No Cached copy of Libre Office exists , extracting tar stream from brotli file")
        buffer = BytesIO()
        with open('/opt/lo.tar.br', 'rb') as brotli_file:
            decompressor = brotli.Decompressor()
            while True:
                chunk = brotli_file.read(1024)
                buffer.write(decompressor.decompress(chunk))
                if len(chunk) < 1024:
                    break
        buffer.seek(0)
        print('Extracting tar stream to /tmp for caching')
        with tarfile.open(fileobj=buffer) as tar:
            print('opening tar file')
            tar.extractall('/tmp')
        print('LibreOffice caching done!')
    return f'{LIBRE_OFFICE_INSTALL_DIR}/program/soffice.bin'

def convert_word_to_pdf(soffice_path, word_file_path, output_dir):
    conv_cmd = f"{soffice_path} --headless --norestore --invisible --nodefault --nofirststartwizard --nolockcheck --nologo --convert-to pdf:writer_pdf_Export --outdir {output_dir} {word_file_path}"
    response = subprocess.run(conv_cmd.split(), stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    if response.returncode != 0:
        response = subprocess.run(conv_cmd.split(), stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        if response.returncode != 0:
            return False
    return True

def download_from_s3(bucket, key, download_path):
    s3 = boto3.client('s3')
    s3.download_file(bucket, key, download_path)

def upload_to_s3(file_pathn, bucket, key):
    s3 = boto3.client('s3')
    s3.upload_file(file_pathn, bucket, key)

# Create an S3 client
s3 = boto3.client('s3')
bucket = 'bucketname'
prefix = 'media/files/'

def merge_pdfs(bucket, prefix):
    # Get a list of all subdirectories in the specified prefix
    result = s3.list_objects_v2(Bucket=bucket, Prefix=prefix, Delimiter='/')
    # Get a list of all subdirectories
    subdirectories = [prefix + obj['Prefix'].split('/')[-2] + '/' for obj in result.get('CommonPrefixes', [])]
    # Loop through all subdirectories
    for subdirectory in subdirectories:
        # Get a list of all inner subdirectories in the subdirectory
        inner_result = s3.list_objects_v2(Bucket=bucket, Prefix=subdirectory, Delimiter='/')
        # Get a list of all inner subdirectories
        inner_subdirectories = [subdirectory + obj['Prefix'].split('/')[-2] + '/' for obj in inner_result.get('CommonPrefixes', [])]
        for inner_subdirectory in inner_subdirectories:
            # Get a list of all PDF objects in the inner subdirectory
            obj_list = s3.list_objects_v2(Bucket=bucket, Prefix=inner_subdirectory)
            # Get a list of all PDF object keys in the inner subdirectory
            keys = [obj['Key'] for obj in obj_list['Contents']]
            # Create a PDF merger object
            pdf_merger = PdfMerger()
            newS3 = boto3.resource('s3')
            bucket1 = newS3.Bucket(bucket)
            # To check if mergedfile already exists
            obj = list(bucket1.objects.filter(Prefix=inner_subdirectory + 'newmerged.pdf'))
            if len(obj) > 0:
                print("Exists")
            else:
                print("Not Exists")
            # Loop through all PDF objects in the inner subdirectory
            print(len(keys))
            for key in keys:
                if key.endswith('.pdf'):
                    obj = s3.get_object(Bucket=bucket, Key=key)
                    pdf_content = obj['Body'].read()
                    pdf_merger.append(io.BytesIO(pdf_content))
            y = io.BytesIO()
            pdf_merger.write(y)
            y.seek(0)
            s3.put_object(Bucket=bucket, Key=inner_subdirectory + "newmerged.pdf", Body=y)

def lambda_handler(event, context):
    print(event)
    key = event['Records'][0]['s3']['object']['key']
    key_prefix, base_name = os.path.split(key)
    download_path = f"/tmp/{base_name}"
    output_dir = "/tmp"
    soffice_path = load_libre_office()
    if not key.endswith('.pdf'):
        download_from_s3(bucket, key, download_path)
        is_converted = convert_word_to_pdf(soffice_path, download_path, output_dir)
        print('isconverting')
        if is_converted:
            file_name, _ = os.path.splitext(base_name)
            upload_to_s3(f"{output_dir}/{file_name}.pdf", bucket, f"{key_prefix}/{file_name}.pdf")
            print('uploaded')
    #sleep(100)
    merge_pdfs(bucket, prefix)
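One way to make sure the merge only sees finished conversions is to wait for the converted PDF to actually appear in the output directory before uploading and calling merge_pdfs. This is only a sketch of that idea, not the poster's code; the poll_for_pdf helper name and the timeout value are made up for illustration:
import os
import time

def poll_for_pdf(output_dir: str, base_name: str, timeout: float = 30.0) -> bool:
    """Wait until soffice has written <base_name>.pdf to output_dir, or give up."""
    file_name, _ = os.path.splitext(base_name)
    expected = os.path.join(output_dir, f"{file_name}.pdf")
    deadline = time.time() + timeout
    while time.time() < deadline:
        # treat a non-empty file as a finished conversion
        if os.path.exists(expected) and os.path.getsize(expected) > 0:
            return True
        time.sleep(0.5)
    return False

# in lambda_handler, after convert_word_to_pdf(...) returns True:
# if poll_for_pdf(output_dir, base_name):
#     upload_to_s3(...)
#     merge_pdfs(bucket, prefix)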

How to copy only non-duplicate files whilst maintaining folder structure?

I am trying to find duplicates between two folders and copy only unique image files to the 'dest' folder. I can copy all the non-dupes using the code below, however it doesn't maintain the source directory structure. I think os.walk returns a 3-tuple, but the parts aren't linked, so I'm not sure how to reconstruct the subdirectories.
Example:
import shutil, os
from difPy import dif

source = input('Input source folder:')
dest = input('Input backup \ destination folder:')
ext = ('.jpg','.jpeg','.gif','.JPG','.JPEG','.GIF')

search = dif(source, dest)
result = search.result
result

dupes = []
srcfiles = []
filecount = []
failed = []
removed = []

for i in result.values():
    dupes.append(i['location'])

for dirpath, subdirs, files in os.walk(source):
    for x in files:
        if x.endswith(ext):
            srcfiles.append(os.path.join(dirpath, x))

for f in srcfiles:
    if f not in dupes:
        shutil.copy(f, dest)
        print('File copied successfully - '+f)
        filecount.append(f)
    else:
        print('File not copied successfully !!!! - '+f)
        failed.append(f)
I have also tried using the shutil.copytree function with an ignore list, however it requires a new folder and I can't get the ignore list function to work.
shutil.copytree example:
for i in result.values():
    df = []
    df.append(i['filename'])

def ignorelist(source, df):
    return [f for f in df if os.path.isfile(os.path.join(source, f))]

shutil.copytree(source, destnew, ignore=ignorelist)
This function ignorelist should do the trick:
import shutil, os
from difPy import dif

source = input('Input source folder:')
dest = input('Input backup \ destination folder:')
ext = ('.jpg','.jpeg','.gif')

search = dif(source, dest)
dupes = [value['location'] for value in search.result.values()]

def ignorelist(source, files):
    return [file for file in files
            if (os.path.isfile(os.path.join(source, file))
                and (os.path.join(source, file) in dupes
                     or not file.lower().endswith(ext)))]

shutil.copytree(source, dest, ignore=ignorelist)
And the other "more manual" way would be
import shutil, os
from difPy import dif

source = input('Input source folder:').rstrip('/\\')
dest = input('Input backup \ destination folder:').rstrip('/\\')
ext = ('.jpg','.jpeg','.gif')

search = dif(source, dest)
dupes = [value['location'] for value in search.result.values()]

srcfiles = []
copied = []
failed = []
skipped = []

for dirpath, subdirs, files in os.walk(source):
    for file in files:
        if file.lower().endswith(ext):
            srcfile = os.path.join(dirpath, file)
            srcfiles.append(srcfile)
            if srcfile in dupes:
                print('File not copied (duplicate) - '+srcfile)
                skipped.append(srcfile)
            else:
                try:
                    destfile = os.path.join(dest, srcfile[len(source)+1:])
                    os.makedirs(os.path.dirname(destfile), exist_ok=True)
                    shutil.copy(srcfile, destfile)
                    print('File copied successfully - '+srcfile)
                    copied.append(srcfile)
                except Exception as err:
                    print('File not copied (error %s) - %s' % (str(err), srcfile))
                    failed.append(srcfile)
I have changed some variable names to make them more descriptive. And what you call failed is really just a list of files that are not copied because they are duplicates rather than files whose copying was attempted but failed.
import shutil, os
from difPy import dif

source = input('Input source folder: ')
dest = input('Input backup \ destination folder: ')

# Remove trailing path separators if they exist:
if source.endswith(('/', '\\')):
    source = source[:-1]
if dest.endswith(('/', '\\')):
    dest = dest[:-1]

# Use the correct path separator to
# ensure correct matching with dif results:
if os.sep == '/':
    source = source.replace('\\', os.sep)
elif os.sep == '\\':
    source = source.replace('/', os.sep)

source_directory_length = len(source) + 1
ext = ('.jpg','.jpeg','.gif','.JPG','.JPEG','.GIF')

search = dif(source, dest)
result = search.result
# Set comprehension:
dupes = {duplicate['location'] for duplicate in result.values()}

copied = []
not_copied = []

for dirpath, subdirs, files in os.walk(source):
    for file in files:
        if file.endswith(ext):
            source_path = os.path.join(dirpath, file)
            if source_path not in dupes:
                # get subdirectory of source directory that this file is in:
                file_length = len(file) + 1
                # Get subdirectory relative to the source directory:
                subdirectory = source_path[source_directory_length:-file_length]
                if subdirectory:
                    dest_directory = os.path.join(dest, subdirectory)
                    # ensure directory exists:
                    os.makedirs(dest_directory, exist_ok=True)
                else:
                    dest_directory = dest
                dest_path = os.path.join(dest_directory, file)
                shutil.copy(source_path, dest_path)
                print('File copied successfully -', source_path)
                copied.append(source_path)
            else:
                print('File not copied -', source_path)
                not_copied.append(source_path)

Trying to copy the content of files from source to destination, if the file is .txt and if the files have the same names, and then ZIP each just-copied file

I'm new to Python and I'm trying to copy the content of files from dir_A to dir_B, if the file is a .txt and if the files in dir_A and dir_B have the same names, and then zip each of the newly copied files.
import os, shutil, zipfile

src_folder = "C:/Users/pushka/pythonApp1/src_folder"
dst_folder = "C:/Users/pushka/pythonApp1/dst_folder"

# only .txt files will be copied
ext = (".txt")

try:
    for src_f in os.scandir(src_folder):
        for dst_f in os.scandir(dst_folder):
            if src_f.path.endswith(ext) and os.path.basename(src_f) == os.path.basename(dst_f):
                # copy file
                shutil.copyfile(src_f, dst_f)
finally:
    print("The 'try except' is finished")
I have searched and tried several options to ZIP, but none of them work properly, so I need your help please
I modified your code a bit, but this should do the trick:
import os, shutil, zipfile

src_folder = "C:/Users/pushka/pythonApp1/src_folder"
dst_folder = "C:/Users/pushka/pythonApp1/dst_folder"

# only .txt files will be copied
ext = ".txt"

copied_files = []
for src_f in os.scandir(src_folder):
    if src_f.name.endswith(ext) and not src_f.is_dir():
        dst_f = os.path.join(dst_folder, src_f.name)
        if not os.path.exists(dst_f):
            shutil.copyfile(src_f, dst_f)
            copied_files.append(dst_f)
print(copied_files)

zipfile_name = os.path.join(dst_folder, "copied_files.zip")
if not os.path.exists(zipfile_name):
    with zipfile.ZipFile(zipfile_name, "w") as zf:
        for txtfile in copied_files:
            print("Writing " + txtfile)
            zf.write(txtfile, os.path.split(txtfile)[-1])
It should be pretty self-explanatory, but I'll walk you through it. In the first for loop, we scan all entries in src_folder. If the name ends in .txt and it is not a directory, we create a path to the destination file. Then, as long as the destination file does not exist, we copy the source to the destination, and add the destination to the copied_files list.
After all the copying is done, we create the zip file's name. If it doesn't exist, we create it using the zipfile.ZipFile context manager and write in each copied file (from the destination, not the source), stripping the full path from it in the archive.
Please note that, by default, the zipfile uses ZIP_STORED as the compression format - i.e., the data is not compressed. See the docs for the other supported compression formats if you need a compressed archive.
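If you do want a compressed archive, you can pass one of those compression constants when opening the archive. A small variation on the block above, assuming the standard zlib module is available (it is on normal Python builds):
import os, zipfile

# same as above, but with DEFLATE compression instead of the default ZIP_STORED
with zipfile.ZipFile(zipfile_name, "w", compression=zipfile.ZIP_DEFLATED) as zf:
    for txtfile in copied_files:
        zf.write(txtfile, os.path.split(txtfile)[-1])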
Thanks a lot, but here is the answer to my own question with your help
import os, shutil, zipfile

src_folder = "C:/Users/pushka/pythonApp1/src_folder"
dst_folder = "C:/Users/pushka/pythonApp1/dst_folder"

# only .txt files will be copied
ext = ".txt"

copied_files = []
for src_f in os.scandir(src_folder):
    for dst_f in os.scandir(dst_folder):
        if src_f.name.endswith(ext) and os.path.basename(src_f) == os.path.basename(dst_f):
            # copy file
            shutil.copyfile(src_f, dst_f)
            copied_files.append(dst_f)
print(copied_files)

for txt_file in copied_files:
    file_root = os.path.splitext(txt_file)[0]
    zip_file_name = file_root + '.zip'
    with zipfile.ZipFile(zip_file_name, mode='w') as zf:
        zf.write(txt_file, os.path.basename(txt_file))
Works as expected
Simple format:
from pathlib import Path
from typing import List
from zipfile import ZipFile

src_folder = Path("C:/Users/pushka/pythonApp1/src_folder")
dst_folder = Path("C:/Users/pushka/pythonApp1/dst_folder")
SUFFIX = ".txt"

def copy_file(from_path: Path, to_path: Path, copied) -> None:
    content = from_path.read_bytes()
    to_path.write_bytes(content)
    copied.append(to_path)
    print(f"Copy file: {from_path} --> {to_path}")

def zip_them(paths: List[Path]) -> str:
    filename = "copied.zip"
    with ZipFile(filename, "w") as z:
        for path in paths:
            z.write(path, path.name)
    return filename

def main():
    assert src_folder.exists(), f"path `{src_folder}` not found!"
    assert dst_folder.exists(), f"path `{dst_folder}` not found!"
    copied = []
    for p in src_folder.glob(f"*{SUFFIX}"):
        dst = dst_folder / p.name
        copy_file(p, dst, copied)
    fn = zip_them(copied)
    print(f"There are {len(copied)} files copied. And zipped to: {fn}")

if __name__ == "__main__":
    main()
My preference:
from typing import List
from zipfile import ZipFile

import anyio  # pip install anyio
from anyio import Path

src_folder = Path("C:/Users/pushka/pythonApp1/src_folder")
dst_folder = Path("C:/Users/pushka/pythonApp1/dst_folder")

async def copy_file(from_path: Path, to_path: Path, copied) -> None:
    content = await from_path.read_bytes()
    await to_path.write_bytes(content)
    copied.append(to_path)
    print(f"copy file: {from_path} --> {to_path}")

def zip_them(paths: List[Path]) -> str:
    filename = "copied.zip"
    with ZipFile(filename, "w") as z:
        for path in paths:
            z.write(path, path.name)
    return filename

async def main():
    copied = []
    async with anyio.create_task_group() as tg:
        async for p in src_folder.glob("*.txt"):
            dst = dst_folder / p.name
            tg.start_soon(copy_file, p, dst, copied)
    fn = zip_them(copied)
    print(f"zip file created: {fn}")

if __name__ == "__main__":
    import timeit
    cost = timeit.timeit("anyio.run(main)", number=1, globals=globals())
    print("Cost:", round(cost, 2), "seconds.")

How do I fix this file_tracker that reads/writes using JSON dictionaries?

I am trying to write a script that tracks changes made to directories/files across multiple file paths created by an installer. I found Thomas Sileo's DirTools project on git, modified it, but am now running into some issues when writing to and reading from JSON:
1) First, I believe that I am writing to JSON incorrectly and am finding that my create_state() function is only writing the last path I need.
2) If I get it working, I am unable to read/parse the file like I was before. I usually get ValueError: Extra data errors.
Code below:
import os
import json
import getpass

files = []
subdirs = []

USER = getpass.getuser()

pathMac = ['/Applications/',
           '/Users/' + USER + '/Documents/']

def create_dir_index(path):
    files = []
    subdirs = []
    for root, dirs, filenames in os.walk(path):
        for subdir in dirs:
            subdirs.append(os.path.relpath(os.path.join(root, subdir), path))
        for f in filenames:
            files.append(os.path.relpath(os.path.join(root, f), path))
    return dict(files=files, subdirs=subdirs)

def create_state():
    for count in xrange(len(pathMac)):
        dir_state = create_dir_index(pathMac[count])
        out_file = open("Manifest.json", "w")
        json.dump(dir_state, out_file)
        out_file.close()

def compare_states(dir_base, dir_cmp):
    '''
    return a comparison of two manifest json files
    '''
    data = {}
    data['deleted'] = list(set(dir_cmp['files']) - set(dir_base['files']))
    data['created'] = list(set(dir_base['files']) - set(dir_cmp['files']))
    data['deleted_dirs'] = list(set(dir_cmp['subdirs']) - set(dir_base['subdirs']))
    data['created_dirs'] = list(set(dir_base['subdirs']) - set(dir_cmp['subdirs']))
    return data

if __name__ == '__main__':
    response = raw_input("Would you like to Compare or Create? ")
    if response == "Create":
        # CREATE MANIFEST json file
        create_state()
        print "Manifest file created."
    elif response == "Compare":
        # create the CURRENT state of all indexes in pathMac and write to json file
        for count in xrange(len(pathMac)):
            dir_state = create_dir_index(pathMac[count])
            out_file = open("CurrentState.json", "w")
            json.dump(dir_state, out_file)
            out_file.close()
        # Open and Load the contents from the file into dictionaries
        manifest = json.load(open("Manifest.json", "r"))
        current = json.load(open("CurrentState.json", "r"))
        print compare_states(current, manifest)
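For issue 1), create_state() reopens Manifest.json with mode "w" on every pass through pathMac, so each iteration overwrites the previous one and only the last path survives. One possible fix, sketched against the code above, is to build a single dictionary keyed by path and write it once; writing one JSON document per file also avoids the ValueError: Extra data you get when several JSON documents end up concatenated in one file:
def create_state():
    state = {}
    for path in pathMac:
        # index each path and store it under its own key
        state[path] = create_dir_index(path)
    # write the whole manifest in one json.dump call
    with open("Manifest.json", "w") as out_file:
        json.dump(state, out_file)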

Python, Deleting all files in a folder older than X days

I'm trying to write a python script to delete all files in a folder older than X days. This is what I have so far:
import os, time, sys

path = r"c:\users\%myusername%\downloads"
now = time.time()

for f in os.listdir(path):
    if os.stat(f).st_mtime < now - 7 * 86400:
        if os.path.isfile(f):
            os.remove(os.path.join(path, f))
When I run the script, I get:
Error2 - system cannot find the file specified,
and it gives the filename. What am I doing wrong?
os.listdir() returns a list of bare filenames. These do not have a full path, so you need to combine it with the path of the containing directory. You are doing this when you go to delete the file, but not when you stat the file (or when you do isfile() either).
Easiest solution is just to do it once at the top of your loop:
f = os.path.join(path, f)
Now f is the full path to the file and you just use f everywhere (change your remove() call to just use f too).
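Put together, a minimal sketch of the corrected loop looks like this:
import os, time

path = r"c:\users\%myusername%\downloads"
now = time.time()

for f in os.listdir(path):
    f = os.path.join(path, f)  # build the full path once
    if os.path.isfile(f) and os.stat(f).st_mtime < now - 7 * 86400:
        os.remove(f)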
I think the new pathlib thingy together with the arrow module for dates make for neater code.
from pathlib import Path
import arrow

filesPath = r"C:\scratch\removeThem"

criticalTime = arrow.now().shift(hours=+5).shift(days=-7)

for item in Path(filesPath).glob('*'):
    if item.is_file():
        print(str(item.absolute()))
        itemTime = arrow.get(item.stat().st_mtime)
        if itemTime < criticalTime:
            # remove it
            pass
pathlib makes it easy to list the directory contents, to access file characteristics such as creation times, and to get full paths.
arrow makes calculations of times easier and neater.
Here's the output showing the full paths offered by pathlib. (No need to join.)
C:\scratch\removeThem\four.txt
C:\scratch\removeThem\one.txt
C:\scratch\removeThem\three.txt
C:\scratch\removeThem\two.txt
You need to give it the path as well, or it will look in the cwd, which, ironically enough, you did in the os.remove call but nowhere else:
for f in os.listdir(path):
    if os.stat(os.path.join(path, f)).st_mtime < now - 7 * 86400:
I did it in a more efficient way:
import os, time

path = "/home/mansoor/Documents/clients/AirFinder/vendors"
now = time.time()

for filename in os.listdir(path):
    filestamp = os.stat(os.path.join(path, filename)).st_mtime
    filecompare = now - 7 * 86400
    if filestamp < filecompare:
        print(filename)
You need to use if os.stat(os.path.join(path, f)).st_mtime < now - 7 * 86400: instead of if os.stat(f).st_mtime < now - 7 * 86400:
I find using os.path.getmtime more convenient :-
import os, time

path = r"c:\users\%myusername%\downloads"
now = time.time()

for filename in os.listdir(path):
    # if os.stat(os.path.join(path, filename)).st_mtime < now - 7 * 86400:
    if os.path.getmtime(os.path.join(path, filename)) < now - 7 * 86400:
        if os.path.isfile(os.path.join(path, filename)):
            print(filename)
            os.remove(os.path.join(path, filename))
A simple python script to remove /logs/ files older than 10 days
#!/usr/bin/python
# run by crontab
# removes any files in /logs/ older than 10 days
import os, sys, time
from subprocess import call

def get_file_directory(file):
    return os.path.dirname(os.path.abspath(file))

now = time.time()
cutoff = now - (10 * 86400)

files = os.listdir(os.path.join(get_file_directory(__file__), "logs"))
file_path = os.path.join(get_file_directory(__file__), "logs/")
for xfile in files:
    if os.path.isfile(str(file_path) + xfile):
        t = os.stat(str(file_path) + xfile)
        c = t.st_ctime
        # delete file if older than 10 days
        if c < cutoff:
            os.remove(str(file_path) + xfile)
You can replace __file__ with your own path.
This deletes files older than 60 days.
import os

directory = '/home/coffee/Documents'
os.system("find " + directory + " -mtime +60 -print")
os.system("find " + directory + " -mtime +60 -delete")
With comprehensions, it can be:
import os
from time import time

p = '.'
result = [os.remove(file) for file in (os.path.join(path, file) for path, _, files in os.walk(p) for file in files) if os.stat(file).st_mtime < time() - 7 * 86400]
print(result)
remove files that match = os.remove(file)
loop over all files in the path = for file in ...
generator of all files = (os.path.join(path, file) for path, _, files in os.walk(p) for file in files)
p is a directory in the filesystem
verify mtime for a match = if os.stat(file).st_mtime < time() - 7 * 86400
May be see: https://ideone.com/Bryj1l
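If the one-liner is hard to read, the same logic written as a plain loop (an equivalent sketch, not different in behaviour) is:
import os
from time import time

p = '.'
cutoff = time() - 7 * 86400
removed = []
for path, _, files in os.walk(p):
    for name in files:
        full = os.path.join(path, name)
        # same mtime test as the comprehension above
        if os.stat(full).st_mtime < cutoff:
            os.remove(full)
            removed.append(full)
print(removed)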
Here's how I do it on my Windows machines. It uses shutil to also remove subdirectories created in downloads. I also have a similar one to keep the folders cleaned up on the hard drive of my son's computer, as he has special needs and tends to let things get out of control fast.
import os, time, shutil

paths = (("C:" + os.getenv('HOMEPATH') + "\Downloads"), (os.getenv('TEMP')))
oneday = (time.time()) - 1 * 86400

try:
    for path in paths:
        for filename in os.listdir(path):
            if os.path.getmtime(os.path.join(path, filename)) < oneday:
                if os.path.isfile(os.path.join(path, filename)):
                    print(filename)
                    os.remove(os.path.join(path, filename))
                elif os.path.isdir(os.path.join(path, filename)):
                    print(filename)
                    shutil.rmtree((os.path.join(path, filename)))
                    os.remove(os.path.join(path, filename))
except:
    pass
print("Maintenance Complete!")
Some of the other answers also have the same code, but I feel they have overcomplicated a very simple process.
import os
import time

# folder to clear
dir_path = 'path of directory to clean'
# no. of days before which the files are to be deleted
limit_days = 10

treshold = time.time() - limit_days * 86400
entries = os.listdir(dir_path)
for dir in entries:
    creation_time = os.stat(os.path.join(dir_path, dir)).st_ctime
    if creation_time < treshold:
        print(f"{dir} is created on {time.ctime(creation_time)} and will be deleted")
I might be a tad late to the party, but this is my approach: use datetime's timestamp() to convert the date object to a float and compare it to file.stat().st_mtime.
from pathlib import Path
import datetime as dt
from time import ctime

remove_before = dt.datetime.now() - dt.timedelta(days=10)  # files older than 10 days
removeMe = Path.home() / 'downloads'  # points to :\users\%myusername%\

for file in removeMe.iterdir():
    if remove_before.timestamp() > file.stat().st_mtime:
        print(ctime(file.stat().st_mtime))
        file.unlink()  # to delete the file
I would like to add what I came up with to do this task. The function is called in the login process.
def remove_files():
    removed = 0
    path = "desired path"
    # Check current working directory.
    dir_to_search = os.getcwd()
    print "Current working directory %s" % dir_to_search
    # Compare current to desired directory
    if dir_to_search != "full desired path":
        # Now change the directory
        os.chdir(path)
        # Check current working directory.
        dir_to_search = os.getcwd()
        print "Directory changed successfully %s" % dir_to_search
    for dirpath, dirnames, filenames in os.walk(dir_to_search):
        for file in filenames:
            curpath = os.path.join(dirpath, file)
            file_modified = datetime.datetime.fromtimestamp(os.path.getmtime(curpath))
            if datetime.datetime.now() - file_modified > datetime.timedelta(hours=1):
                os.remove(curpath)
                removed += 1
    print(removed)
