LRU cache on hard drive - Python

I want to be able to decorate a function as you would with functools.lru_cache; however, I want the results to be cached on the hard drive and not in memory. Looking around, I get the feeling this is a solved problem, and I was wondering if anyone could point me in the right direction (or at least give me a few more keywords to try googling).
I don't know if this will help or if it matters, but the function is computing images from unique filenames.
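If you would rather reuse an existing library than roll your own, useful search keywords are "disk cache decorator", "memoize to disk", and "persistent memoization"; packages such as joblib and diskcache cover exactly this use case. A minimal sketch with joblib's Memory decorator (assumes joblib is installed; this is not part of the answer below):
from joblib import Memory

memory = Memory('image_cache', verbose=0)  # results are pickled into this directory

@memory.cache
def compute_image(filename):
    ...  # expensive image computation, keyed on the arguments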

Here's some code to get you started:
from pathlib import Path
import pickle
import hashlib
import os


class LRU_Cache:

    def __init__(self, directory, original_function, maxsize=10):
        self.directory = directory
        self.original_function = original_function
        self.maxsize = maxsize
        try:
            os.mkdir(directory)
        except OSError:
            pass

    def __call__(self, *args):
        filename = hashlib.sha1(pickle.dumps(args)).hexdigest()
        fullname = os.path.join(self.directory, filename)
        try:
            with open(fullname, 'rb') as f:
                value = pickle.load(f)
            Path(fullname).touch()
            return value
        except FileNotFoundError:
            pass
        value = self.original_function(*args)
        with open(fullname, 'wb') as f:
            pickle.dump(value, f)
        filenames = os.listdir(self.directory)
        if len(filenames) <= self.maxsize:
            return value
        fullnames = [os.path.join(self.directory, filename)
                     for filename in filenames]
        oldest = min(fullnames, key=lambda fn: os.stat(fn).st_mtime)
        os.remove(oldest)
        return value
It hashes the arguments to create a unique filename for each function call, and pickles the function's return value to that file.
Cache hits unpickle the stored result and update the file modification time.
If the cache directory exceeds a target size, the oldest cache file is removed.
Use it like this:
def square(x):
    print('!')
    return x ** 2

sqr = LRU_Cache('square_cache', square, 10)
Now call sqr normally and results will be cached to disk.
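Because the class takes the wrapped function as a constructor argument, it can also be applied in decorator style via functools.partial; a small sketch (not part of the original answer):
import functools

disk_cached = functools.partial(LRU_Cache, 'square_cache', maxsize=10)

@disk_cached
def cube(x):
    return x ** 3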

Related

How to update a file inside a folder in a zipfile without unzipping the zip in Python? [duplicate]

I have archive.zip with two files: hello.txt and world.txt
I want to overwrite the hello.txt file with a new one using this code:
import zipfile
z = zipfile.ZipFile('archive.zip','a')
z.write('hello.txt')
z.close()
but it won't overwrite the file; somehow it creates another entry named hello.txt (the WinZip screenshot in the original post shows both).
Since there is nothing like zipfile.remove(), what's the best way to handle this problem?
There's no way to do that with Python's zipfile module. You have to create a new zip file and recompress everything from the original archive again, plus the new modified file.
Below is some code to do just that. But note that it isn't efficient, since it decompresses and then recompresses all the data.
import tempfile
import zipfile
import shutil
import os


def remove_from_zip(zipfname, *filenames):
    tempdir = tempfile.mkdtemp()
    try:
        tempname = os.path.join(tempdir, 'new.zip')
        with zipfile.ZipFile(zipfname, 'r') as zipread:
            with zipfile.ZipFile(tempname, 'w') as zipwrite:
                for item in zipread.infolist():
                    if item.filename not in filenames:
                        data = zipread.read(item.filename)
                        zipwrite.writestr(item, data)
        shutil.move(tempname, zipfname)
    finally:
        shutil.rmtree(tempdir)
Usage:
remove_from_zip('archive.zip', 'hello.txt')
with zipfile.ZipFile('archive.zip', 'a') as z:
    z.write('hello.txt')
Building on nosklo's answer.
UpdateableZipFile is a class that inherits from ZipFile, maintains the same interface, and adds the ability to overwrite files (via writestr or write) and to remove files.
import os
import shutil
import tempfile
from zipfile import ZipFile, ZIP_STORED, ZipInfo


class UpdateableZipFile(ZipFile):
    """
    Add delete (via remove_file) and update (via writestr and write methods).
    To enable the update features, use UpdateableZipFile with the 'with' statement;
    upon __exit__ (if updates were applied) a new zip file will override the existing one with the updates.
    """

    class DeleteMarker(object):
        pass

    def __init__(self, file, mode="r", compression=ZIP_STORED, allowZip64=False):
        # Init base
        super(UpdateableZipFile, self).__init__(file, mode=mode,
                                                compression=compression,
                                                allowZip64=allowZip64)
        # track files to override in the zip
        self._replace = {}
        # Whether the with statement was called
        self._allow_updates = False

    def writestr(self, zinfo_or_arcname, bytes, compress_type=None):
        if isinstance(zinfo_or_arcname, ZipInfo):
            name = zinfo_or_arcname.filename
        else:
            name = zinfo_or_arcname
        # If the file exists and needs to be overridden,
        # mark the entry and create a temp file for it.
        # We allow this only if the with statement is used.
        if self._allow_updates and name in self.namelist():
            temp_file = self._replace[name] = self._replace.get(name,
                                                                tempfile.TemporaryFile())
            temp_file.write(bytes)
        # Otherwise just act normally
        else:
            super(UpdateableZipFile, self).writestr(zinfo_or_arcname,
                                                    bytes, compress_type=compress_type)

    def write(self, filename, arcname=None, compress_type=None):
        arcname = arcname or filename
        # If the file exists and needs to be overridden,
        # mark the entry and create a temp file for it.
        # We allow this only if the with statement is used.
        if self._allow_updates and arcname in self.namelist():
            temp_file = self._replace[arcname] = self._replace.get(arcname,
                                                                   tempfile.TemporaryFile())
            with open(filename, "rb") as source:
                shutil.copyfileobj(source, temp_file)
        # Otherwise just act normally
        else:
            super(UpdateableZipFile, self).write(filename,
                                                 arcname=arcname, compress_type=compress_type)

    def __enter__(self):
        # Allow updates
        self._allow_updates = True
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # call base to close zip file, organically
        try:
            super(UpdateableZipFile, self).__exit__(exc_type, exc_val, exc_tb)
            if len(self._replace) > 0:
                self._rebuild_zip()
        finally:
            # In case rebuilding the zip failed,
            # be sure to still release all the temp files
            self._close_all_temp_files()
            self._allow_updates = False

    def _close_all_temp_files(self):
        for temp_file in self._replace.values():
            if hasattr(temp_file, 'close'):
                temp_file.close()

    def remove_file(self, path):
        self._replace[path] = self.DeleteMarker()

    def _rebuild_zip(self):
        tempdir = tempfile.mkdtemp()
        try:
            temp_zip_path = os.path.join(tempdir, 'new.zip')
            with ZipFile(self.filename, 'r') as zip_read:
                # Create new zip with assigned properties
                with ZipFile(temp_zip_path, 'w', compression=self.compression,
                             allowZip64=self._allowZip64) as zip_write:
                    for item in zip_read.infolist():
                        # Check if the file should be replaced or deleted
                        replacement = self._replace.get(item.filename, None)
                        # If marked for deletion, do not copy the file to the new zipfile
                        if isinstance(replacement, self.DeleteMarker):
                            del self._replace[item.filename]
                            continue
                        # If marked for replacement, copy temp_file instead of the old file
                        elif replacement is not None:
                            del self._replace[item.filename]
                            # Write replacement to archive,
                            # and then close it (deleting the temp file)
                            replacement.seek(0)
                            data = replacement.read()
                            replacement.close()
                        else:
                            data = zip_read.read(item.filename)
                        zip_write.writestr(item, data)
            # Override the archive with the updated one
            shutil.move(temp_zip_path, self.filename)
        finally:
            shutil.rmtree(tempdir)
Usage example:
with UpdateableZipFile(r"C:\Temp\Test2.docx", "a") as o:
    # Overwrite a file with a string
    o.writestr("word/document.xml", "Some data")
    # Exclude an existing file from the zip
    o.remove_file("word/fontTable.xml")
    # Write a new file (with no conflict) to the zip
    o.writestr("new_file", "more data")
    # Overwrite a file with a file
    o.write(r"C:\Temp\example.png", "word/settings.xml")
Based on this answer, here's a quick and dirty way to monkey-patch the stock zipfile module to support file deletion (while we wait for it to be accepted into python:main):
from zipfile import ZipFile, ZipInfo
from operator import attrgetter
import functools


def enable_zip_remove(func):
    def _zipfile_remove_member(self, member):
        # get a sorted filelist by header offset, in case the dir order
        # doesn't match the actual entry order
        fp = self.fp
        entry_offset = 0
        filelist = sorted(self.filelist, key=attrgetter('header_offset'))
        for i in range(len(filelist)):
            info = filelist[i]
            # find the target member
            if info.header_offset < member.header_offset:
                continue
            # get the total size of the entry
            entry_size = None
            if i == len(filelist) - 1:
                entry_size = self.start_dir - info.header_offset
            else:
                entry_size = filelist[i + 1].header_offset - info.header_offset
            # found the member, set the entry offset
            if member == info:
                entry_offset = entry_size
                continue
            # Move entry
            # read the actual entry data
            fp.seek(info.header_offset)
            entry_data = fp.read(entry_size)
            # update the header
            info.header_offset -= entry_offset
            # write the entry to the new position
            fp.seek(info.header_offset)
            fp.write(entry_data)
            fp.flush()
        # update state
        self.start_dir -= entry_offset
        self.filelist.remove(member)
        del self.NameToInfo[member.filename]
        self._didModify = True
        # seek to the start of the central dir
        fp.seek(self.start_dir)

    def zipfile_remove(self, member):
        """Remove a file from the archive. The archive must be open with mode 'a'"""
        if self.mode != 'a':
            raise RuntimeError("remove() requires mode 'a'")
        if not self.fp:
            raise ValueError(
                "Attempt to write to ZIP archive that was already closed")
        if self._writing:
            raise ValueError(
                "Can't write to ZIP archive while an open writing handle exists."
            )
        # Make sure we have an info object
        if isinstance(member, ZipInfo):
            # 'member' is already an info object
            zinfo = member
        else:
            # get the info object
            zinfo = self.getinfo(member)
        return self._zipfile_remove_member(zinfo)

    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        if not hasattr(ZipFile, "remove"):
            setattr(ZipFile, "_zipfile_remove_member", _zipfile_remove_member)
            setattr(ZipFile, "remove", zipfile_remove)
        return func(*args, **kwargs)
    return wrapper
Usage:
@enable_zip_remove
def replace_zip_file():
    with ZipFile("archive.zip", "a") as z:
        z.remove("hello.txt")
        z.write("hello.txt")
P.S. NSFW
My solution is similar to the other answers but uses SQLite to manage the intermediate files and provides __getitem__, __setitem__ and __delitem__ for an easy interface.
By default the db is in-memory but you can provide a temp file path if you have a zip larger than available memory.
And SQLite is built into Python, and for many small reads and writes it can be faster than going through the file system.
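The class below imports its SQL statements from a local sql module that isn't shown in the answer; a minimal sketch of what those constants could look like (my assumption, not the author's actual module):
# sql.py (hypothetical) - statements used by EditableZip below
CREATE_TABLE = (
    "CREATE TABLE IF NOT EXISTS files ("
    " file_path TEXT PRIMARY KEY,"
    " file_content BLOB)"
)
INSERT_FILE = (
    "INSERT OR REPLACE INTO files (file_path, file_content)"
    " VALUES (:file_path, :file_content)"
)
SELECT_CONTENT = "SELECT file_content FROM files WHERE file_path = :file_path"
DELETE_FILE = "DELETE FROM files WHERE file_path = :file_path"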
import sqlite3
import subprocess
import zipfile
from pathlib import Path

from sql import CREATE_TABLE, DELETE_FILE, INSERT_FILE, SELECT_CONTENT


class EditableZip:
    """Intended to make editing files inside a zip archive easy, this class is capable of loading files
    from a zip file into a sqlite database, facilitates editing/removing/adding files, and saving
    to a zip.
    The database can be in-memory (default) or in a temporary on-disk file if
    temp_db_path is provided.
    If an on-disk file is used, EditableZip.close can be called to remove the file, or EditableZip
    can be used as a context manager.
    If auto_save is set to True and an initial zip_path was provided then the file will
    be overwritten when EditableZip closes. If you wish to save to a different file,
    or no zip_path is used in instantiation, auto_save can take a file path.
    Files can be added by item assignment:

    with EditableZip(auto_save="example.zip") as ez:
        ez["thing.txt"] = "stuff"
        # empty dir
        ez["empty/"] = None

    Assignment accepts non-text files as bytes.
    EditableZip is subscriptable. If the subscript is a path in the db, the data will be returned.
    EditableZip.files can be used to iterate over files in the db.
    """

    def __init__(
        self,
        zip_path: None | str | Path = None,
        temp_db_path: None | Path = None,
        auto_save: bool | str | Path = False,
    ):
        self.temp_db_path, self.auto_save, self.file_path = (
            temp_db_path,
            auto_save,
            zip_path,
        )
        self.db = sqlite3.connect(
            str(temp_db_path if temp_db_path is not None else ":memory:")
        )
        self.db.execute(CREATE_TABLE)
        if self.file_path:
            self.load(self.file_path)

    @property
    def files(self):
        "Returns a generator of all file paths in the database."
        try:
            return (
                i[0] for i in self.db.execute("SELECT file_path FROM files").fetchall()
            )
        except TypeError:
            return None

    def load(self, zip_path: str | Path) -> None:
        "Add all files from zip at zip_path to db."
        with zipfile.ZipFile(zip_path, mode="r") as archive:
            for item in archive.infolist():
                self[item.filename] = (
                    None if item.filename[-1] == "/" else archive.read(item)
                )

    def save(self, zip_path: None | str | Path) -> Path:
        "Save all files from db to zip at zip_path."
        zip_path = self.file_path if zip_path is None else zip_path
        with zipfile.ZipFile(zip_path, "w") as archive:
            for file in self.files:
                if file_data := self.fetch(file):
                    archive.writestr(file, file_data)
                else:
                    archive.writestr(zipfile.ZipInfo(file), "")
        return zip_path

    def close(self):
        "Auto save if applicable and close + remove db."
        if self.auto_save:
            self.save(
                zip_path=self.auto_save
                if isinstance(self.auto_save, (str, Path))
                else None
            )
        self.db.close()
        if isinstance(self.temp_db_path, Path):
            self.temp_db_path.unlink(missing_ok=True)

    def fetch(self, file_path: str) -> bytes:
        "Get content of db file for file_path."
        try:
            return self.db.execute(SELECT_CONTENT, {"file_path": file_path}).fetchone()[
                0
            ]
        except TypeError:
            return None

    def __getitem__(self, key):
        result = self.fetch(key)
        try:
            return result.decode("utf-8")
        except AttributeError:
            return result

    def __setitem__(self, file_path, content: str | bytes):
        if isinstance(content, str):
            content = content.encode("utf-8")
        self.db.execute(
            INSERT_FILE,
            {"file_path": file_path, "file_content": content},
        )

    def __delitem__(self, file_path):
        self.db.execute(DELETE_FILE, {"file_path": file_path})

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()


if __name__ == "__main__":
    # A use case: editing epub files.
    # File source:
    # https://archiveofourown.org/downloads/13795605/Victoria%20Potter%20and%20the.epub?updated_at=1650231615
    file_path = Path("Victoria Potter and the.epub")
    new_file = (file_path.parent / (file_path.stem + "- lowercase")).with_suffix(
        file_path.suffix
    )
    # Create a copy of the epub with all letters lowercase
    with EditableZip(zip_path=file_path, auto_save=new_file) as ez:
        for file in ez.files:
            if Path(file).suffix in [".html", ".xhtml"]:
                ez[file] = ez[file].lower()
Reference: Delete file from zipfile with the ZipFile Module
In short, you can take the code from https://github.com/python/cpython/blob/659eb048cc9cac73c46349eb29845bc5cd630f09/Lib/zipfile.py and create a separate file from it. After that, just reference it from your project instead of the built-in Python library: import myproject.zipfile as zipfile.
Usage:
with zipfile.ZipFile("archive.zip", "a") as z:
    z.remove("firstfile.txt")

read a dict into self

I have a class that is doing a lot of stuff. In the end, it saves everything into a pickle. When I rerun this class I want to read the pickle instead of doing everything again. Unfortunately, the variable is always empty if I unpickle. Why is that?
import os
import pandas as pd


class Test:
    def __init__(self, path, value):
        # path points to a .txt file but it's in the same folder as the pickle
        data_path, data = os.path.split(path)
        pickle_path = os.path.join(data_path, name.split('.')[1] + '.pickle')
        if os.path.isfile(pickle_path):
            self = pd.read_pickle(path)
        else:
            # do a ton of stuff and save it as a pickle afterwards
            ...


variable = Test(path, value)
In this case, variable is empty if I read from the pickle, but correct if I do all the computation...
If I want to cache some calculation results, I will load/dump the object outside the class, something like:
pickle_path = os.path.join(data_path, name.split('.')[1] + '.pickle')
if os.path.isfile(pickle_path):
    with open(pickle_path, 'rb') as f:
        variable = pickle.load(f)  # use cached results
else:
    variable = Test()  # do all the calculations
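Another common pattern is an alternate constructor implemented as a classmethod, since rebinding self inside __init__ has no effect on the caller; a minimal sketch (hypothetical names, not from the original post):
import os
import pickle


class Test:
    def __init__(self, path, value):
        self.path = path
        self.value = value
        # ... do the expensive computation here ...

    @classmethod
    def from_cache_or_compute(cls, path, value, pickle_path):
        # Return the previously pickled instance if it exists,
        # otherwise compute a new one and pickle it for next time.
        if os.path.isfile(pickle_path):
            with open(pickle_path, 'rb') as f:
                return pickle.load(f)
        obj = cls(path, value)
        with open(pickle_path, 'wb') as f:
            pickle.dump(obj, f)
        return obj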

grab next .zip file in folder (iterate through zip directory)

Below is my most recent attempt, but when I print current_file it's always the same (first) .zip file in my directory.
Why/how can I iterate this to get to the next file in my zip directory?
My DIRECTORY_LOCATION has 4 zip files in it.
def find_file(cls):
    listOfFiles = os.listdir(config.DIRECTORY_LOCATION)
    total_files = 0
    for entry in listOfFiles:
        total_files += 1
        # if fnmatch.fnmatch(entry, pattern):
        current_file = entry
        print(current_file)
        """Finds the excel file to process"""
        archive = ZipFile(config.DIRECTORY_LOCATION + "/" + current_file)
        for file in archive.filelist:
            if file.filename.__contains__('Contact Frog'):
                return archive.extract(file.filename, config.UNZIP_LOCATION)
        return FileNotFoundError
find_file usage:
excel_data = pandas.read_excel(self.find_file())
Update:
I just tried changing return to yield at:
yield archive.extract(file.filename, config.UNZIP_LOCATION)
and now getting the below error at my find_file line.
ValueError: Invalid file path or buffer object type: <class 'generator'>
Then I altered it to use the generator object as suggested in the comments, i.e.:
generator = self.find_file(); excel_data = pandas.read_excel(generator())
and now getting this error:
generator = self.find_file(); excel_data = pandas.read_excel(generator())
TypeError: 'generator' object is not callable
Here is my /main.py if helpful
"""Start Point"""
from data.find_pending_records import FindPendingRecords
from vital.vital_entry import VitalEntry
import sys
import os
import config
import datetime
# from csv import DictWriter
if __name__ == "__main__":
try:
for file in os.listdir(config.DIRECTORY_LOCATION):
if 'VCCS' in file:
PENDING_RECORDS = FindPendingRecords().get_excel_data()
# Do operations on PENDING_RECORDS
# Reads excel to map data from excel to vital
MAP_DATA = FindPendingRecords().get_mapping_data()
# Configures Driver
VITAL_ENTRY = VitalEntry()
# Start chrome and navigate to vital website
VITAL_ENTRY.instantiate_chrome()
# Begin processing Records
VITAL_ENTRY.process_records(PENDING_RECORDS, MAP_DATA)
except:
print("exception occured")
raise
It is not tested.
def find_file(cls):
    listOfFiles = os.listdir(config.DIRECTORY_LOCATION)
    total_files = 0
    for entry in listOfFiles:
        total_files += 1
        # if fnmatch.fnmatch(entry, pattern):
        current_file = entry
        print(current_file)
        """Finds the excel file to process"""
        archive = ZipFile(config.DIRECTORY_LOCATION + "/" + current_file)
        for file in archive.filelist:
            if file.filename.__contains__('Contact Frog'):
                yield archive.extract(file.filename, config.UNZIP_LOCATION)
This is just your function rewritten with yield instead of return.
I think it should be used in the following way:
for extracted_archive in self.find_file():
excel_data = pandas.read_excel(extracted_archive)
#do whatever you want to do with excel_data here
self.find_file() returns a generator and should be used like an iterator (read this answer for more details).
Try to integrate the previous loop into your main script. On each iteration of the loop it will read a different file into excel_data, so the body of the loop is also where you should do whatever you need to do with the data.
Not sure what you mean by:
just one each time the script is executed
Even with yield, if you execute the script multiple times, you will always start from the beginning (and always get the first file). You should read all of the files in the same execution.
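For completeness: within a single run you can also pull files one at a time from the same generator object with next(); a small sketch (names taken from the question, not from the original answer):
files = self.find_file()      # create the generator once
first = next(files, None)     # first extracted file, or None if nothing matched
second = next(files, None)    # the next one, and so on until the generator is exhausted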

Class for file creation and directory validation

After reading some texts about file creation in Python, I decided to create this class, which creates a new file in a directory and creates a backup in another directory if the file already exists (and is older than x hours).
The main reason I opened this question is to find out whether this is a correct way to write a class using try/except, because I'm getting a little confused about when to prefer try/except over if/else.
Below is the working example:
import os
import datetime


class CreateXML():

    def __init__(self, path, filename):
        self.path = path
        self.bkp_path = "%s\\backup" % path
        self.filename = filename
        self.bkp_file = "%s.previous" % filename
        self.create_check = datetime.datetime.now()-datetime.timedelta(hours=-8)

    @staticmethod
    def create_dir(path):
        try:
            os.makedirs(path)
            return True
        except:
            return False

    @staticmethod
    def file_check(file):
        try:
            open(file)
            return True
        except:
            return False

    def create_file(self, target_dir, target_file):
        try:
            target = "%s\\%s" % (target_dir, target_file)
            open(target, 'w')
        except:
            return False

    def start_creation(self):
        try:
            # Check if file exists
            if self.file_check("%s\\%s" % (self.path, self.filename)):
                self.create_dir(self.bkp_path)
                creation = os.path.getmtime("%s\\%s" % (self.path, self.filename))
                fcdata = datetime.datetime.fromtimestamp(creation)
                # File exists and it's older than 8 hours
                if fcdata < self.create_check:
                    bkp_file_path = "%s\\%s " % (self.bkp_path, self.bkp_file)
                    new_file_path = "%s\\%s " % (self.path, self.filename)
                    # If backup file exists, erase current backup file,
                    # move existing file to backup and create new file.
                    if self.file_check("%s\\%s" % (self.bkp_path, self.bkp_file)):
                        os.remove(bkp_file_path)
                        os.rename(new_file_path, bkp_file_path)
                        self.create_file(self.bkp_path, self.bkp_file)
                    # No backup file, create new one.
                    else:
                        self.create_file(self.bkp_path, self.bkp_file)
            else:
                # Fresh creation
                self.create_dir(self.path)
                self.create_file(self.path, self.filename)
        except OSError, e:
            print e


if __name__ == '__main__':
    path = 'c:\\tempdata'
    filename = 'somefile.txt'
    cx = CreateXML(path, filename)
    cx.start_creation()
So, basically, the real questions here are:
- With the example above, is the usage of try/except correct?
- Is it correct to perform the validations using try/except to check whether a file or directory already exists, instead of using a simplified version like this one?
import os

# Simple method of doing it
path = 'c:\\tempdata'
filename = 'somefile.txt'
bkp_path = 'c:\\tempdata\\backup'
bkp_file = 'somefile.txt.bkp'
new_file_path = "%s\\%s" % (path, filename)
bkp_file_path = "%s\\%s" % (bkp_path, bkp_file)

if not os.path.exists(path):
    print "create path"
    os.makedirs(bkp_path)
if not os.path.isfile(new_file_path):
    print "create new file"
    open(new_file_path, 'w')
else:
    print "file exists, moving to backup folder"
    # check if backup file exists
    if not os.path.isfile(bkp_file_path):
        print "New backup file created"
        open(bkp_file_path, 'w')
    else:
        print "backup exists, removing backup, backup the current, and creating newfile"
        os.remove(bkp_file_path)
        os.rename(new_file_path, bkp_file_path)
        open(bkp_file_path, 'w')
- If the usage of try/except is correct, is it recommended to write a big class to create a file when it's possible to write a short version of it?
Please do not close this thread, since I'm really confused about the most correct, Pythonic way to do it.
Thanks in advance.
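A general note on the try/except part of the question: the EAFP style is fine in Python, but it is usually combined with catching specific exceptions rather than a bare except, so unrelated errors aren't silently swallowed. A minimal sketch of that pattern (not from the original post):
import errno
import os


def ensure_dir(path):
    # Create the directory, ignoring only the "already exists" error.
    try:
        os.makedirs(path)
    except OSError as exc:
        if exc.errno != errno.EEXIST:
            raise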

Finding duplicate files and removing them

I am writing a Python program to find and remove duplicate files from a folder.
I have multiple copies of mp3 files, and some other files. I am using the SHA-1 algorithm.
How can I find these duplicate files and remove them?
Fastest algorithm - 100x performance increase compared to the accepted answer (really :))
The approaches in the other solutions are very cool, but they forget about an important property of duplicate files: they have the same file size. Calculating the expensive hash only on files with the same size saves a tremendous amount of CPU; performance comparisons are at the end, and here's the explanation.
Iterating on the solid answer given by @nosklo, and borrowing the idea from @Raffi of hashing just the beginning of each file and calculating the full hash only on collisions in the fast hash, here are the steps:
Build up a hash table of the files, where the file size is the key.
For files with the same size, create a hash table with the hash of their first 1024 bytes; non-colliding elements are unique
For files with the same hash on the first 1k bytes, calculate the hash on the full contents - files with matching ones are NOT unique.
The code:
#!/usr/bin/env python3
from collections import defaultdict
import hashlib
import os
import sys


def chunk_reader(fobj, chunk_size=1024):
    """Generator that reads a file in chunks of bytes"""
    while True:
        chunk = fobj.read(chunk_size)
        if not chunk:
            return
        yield chunk


def get_hash(filename, first_chunk_only=False, hash=hashlib.sha1):
    hashobj = hash()
    file_object = open(filename, 'rb')

    if first_chunk_only:
        hashobj.update(file_object.read(1024))
    else:
        for chunk in chunk_reader(file_object):
            hashobj.update(chunk)
    hashed = hashobj.digest()

    file_object.close()
    return hashed


def check_for_duplicates(paths, hash=hashlib.sha1):
    hashes_by_size = defaultdict(list)  # dict of size_in_bytes: [full_path_to_file1, full_path_to_file2, ]
    hashes_on_1k = defaultdict(list)  # dict of (hash1k, size_in_bytes): [full_path_to_file1, full_path_to_file2, ]
    hashes_full = {}  # dict of full_file_hash: full_path_to_file_string

    for path in paths:
        for dirpath, dirnames, filenames in os.walk(path):
            # get all files that have the same size - they are the collision candidates
            for filename in filenames:
                full_path = os.path.join(dirpath, filename)
                try:
                    # if the target is a symlink (soft one), this will
                    # dereference it - change the value to the actual target file
                    full_path = os.path.realpath(full_path)
                    file_size = os.path.getsize(full_path)
                    hashes_by_size[file_size].append(full_path)
                except (OSError,):
                    # not accessible (permissions, etc) - pass on
                    continue

    # For all files with the same file size, get their hash on the first 1024 bytes only
    for size_in_bytes, files in hashes_by_size.items():
        if len(files) < 2:
            continue    # this file size is unique, no need to spend CPU cycles on it

        for filename in files:
            try:
                small_hash = get_hash(filename, first_chunk_only=True)
                # the key is the hash on the first 1024 bytes plus the size - to
                # avoid collisions on equal hashes in the first part of the file
                # credits to @Futal for the optimization
                hashes_on_1k[(small_hash, size_in_bytes)].append(filename)
            except (OSError,):
                # the file access might've changed till the exec point got here
                continue

    # For all files with the same hash on the first 1024 bytes, get their hash on the full file - collisions will be duplicates
    for __, files_list in hashes_on_1k.items():
        if len(files_list) < 2:
            continue    # this hash of the first 1k file bytes is unique, no need to spend CPU cycles on it

        for filename in files_list:
            try:
                full_hash = get_hash(filename, first_chunk_only=False)
                duplicate = hashes_full.get(full_hash)
                if duplicate:
                    print("Duplicate found: {} and {}".format(filename, duplicate))
                else:
                    hashes_full[full_hash] = filename
            except (OSError,):
                # the file access might've changed till the exec point got here
                continue


if __name__ == "__main__":
    if sys.argv[1:]:
        check_for_duplicates(sys.argv[1:])
    else:
        print("Please pass the paths to check as parameters to the script")
And, here's the fun part - performance comparisons.
Baseline:
a directory with 1047 files (32 mp4, 1015 jpg), total size 5445.998 MiB - i.e. my phone's camera auto-upload directory :)
a small (but fully functional) processor - 1600 BogoMIPS, 1.2 GHz, 32 KB L1 + 256 KB L2 cache, /proc/cpuinfo:
Processor : Feroceon 88FR131 rev 1 (v5l)
BogoMIPS : 1599.07
(i.e. my low-end NAS :), running Python 2.7.11.
So, the output of @nosklo's very handy solution:
root@NAS:InstantUpload# time ~/scripts/checkDuplicates.py
Duplicate found: ./IMG_20151231_143053 (2).jpg and ./IMG_20151231_143053.jpg
Duplicate found: ./IMG_20151125_233019 (2).jpg and ./IMG_20151125_233019.jpg
Duplicate found: ./IMG_20160204_150311.jpg and ./IMG_20160204_150311 (2).jpg
Duplicate found: ./IMG_20160216_074620 (2).jpg and ./IMG_20160216_074620.jpg
real 5m44.198s
user 4m44.550s
sys 0m33.530s
And here's the version that filters on size first, then on small hashes, and finally computes the full hash only if collisions are found:
root@NAS:InstantUpload# time ~/scripts/checkDuplicatesSmallHash.py . "/i-data/51608399/photo/Todor phone"
Duplicate found: ./IMG_20160216_074620 (2).jpg and ./IMG_20160216_074620.jpg
Duplicate found: ./IMG_20160204_150311.jpg and ./IMG_20160204_150311 (2).jpg
Duplicate found: ./IMG_20151231_143053 (2).jpg and ./IMG_20151231_143053.jpg
Duplicate found: ./IMG_20151125_233019 (2).jpg and ./IMG_20151125_233019.jpg
real 0m1.398s
user 0m1.200s
sys 0m0.080s
Both versions were run 3 times each, to get the average time needed.
So v1 is (user+sys) 284s, the other 2s; quite a difference, huh :)
With this increase, one could go to SHA512, or even fancier - the performance penalty would be mitigated by the fewer calculations needed.
Negatives:
More disk access than the other versions - every file is accessed once for size stats (that's cheap, but still disk IO), and every duplicate is opened twice (for the small first-1k-bytes hash and for the full-contents hash)
Will consume more memory due to keeping the hash tables in memory at runtime
Recursive folders version:
This version uses the file size and a hash of the contents to find duplicates.
You can pass it multiple paths; it will scan all paths recursively and report all duplicates found.
import sys
import os
import hashlib


def chunk_reader(fobj, chunk_size=1024):
    """Generator that reads a file in chunks of bytes"""
    while True:
        chunk = fobj.read(chunk_size)
        if not chunk:
            return
        yield chunk


def check_for_duplicates(paths, hash=hashlib.sha1):
    hashes = {}
    for path in paths:
        for dirpath, dirnames, filenames in os.walk(path):
            for filename in filenames:
                full_path = os.path.join(dirpath, filename)
                hashobj = hash()
                for chunk in chunk_reader(open(full_path, 'rb')):
                    hashobj.update(chunk)
                file_id = (hashobj.digest(), os.path.getsize(full_path))
                duplicate = hashes.get(file_id, None)
                if duplicate:
                    print "Duplicate found: %s and %s" % (full_path, duplicate)
                else:
                    hashes[file_id] = full_path


if sys.argv[1:]:
    check_for_duplicates(sys.argv[1:])
else:
    print "Please pass the paths to check as parameters to the script"
import os
import md5


def remove_duplicates(dir):
    unique = []
    for filename in os.listdir(dir):
        if os.path.isfile(filename):
            filehash = md5.md5(file(filename).read()).hexdigest()
            if filehash not in unique:
                unique.append(filehash)
            else:
                os.remove(filename)
Edit:
For MP3 you may be also interested in this topic Detect duplicate MP3 files with different bitrates and/or different ID3 tags?
I wrote one in Python some time ago -- you're welcome to use it.
import sys
import os
import hashlib
check_path = (lambda filepath, hashes, p = sys.stdout.write:
(lambda hash = hashlib.sha1 (file (filepath).read ()).hexdigest ():
((hash in hashes) and (p ('DUPLICATE FILE\n'
' %s\n'
'of %s\n' % (filepath, hashes[hash])))
or hashes.setdefault (hash, filepath)))())
scan = (lambda dirpath, hashes = {}:
map (lambda (root, dirs, files):
map (lambda filename: check_path (os.path.join (root, filename), hashes), files), os.walk (dirpath)))
((len (sys.argv) > 1) and scan (sys.argv[1]))
Faster algorithm
If many large files need to be analyzed (images, mp3s, PDF documents), it is faster to use the following comparison algorithm:
a first, fast hash is performed on the first N bytes of the file (say 1 KB). This hash says with certainty when files are different, but cannot say whether two files are exactly the same (accuracy of the hash, limited data read from disk)
a second, slower hash, which is more accurate and computed over the whole content of the file, is performed if a collision occurs in the first stage
Here is an implementation of this algorithm:
import hashlib
import os
import stat
import types


def Checksum(current_file_name, check_type='sha512', first_block=False):
    """Computes the hash for the given file. If first_block is True,
    only the first block of size size_block is hashed."""
    size_block = 1024 * 1024  # size of the first block (1 MB here)

    d = {'sha1': hashlib.sha1, 'md5': hashlib.md5, 'sha512': hashlib.sha512}
    if(not d.has_key(check_type)):
        raise Exception("Unknown checksum method")

    file_size = os.stat(current_file_name)[stat.ST_SIZE]
    with file(current_file_name, 'rb') as f:
        key = d[check_type].__call__()
        while True:
            s = f.read(size_block)
            key.update(s)
            file_size -= size_block
            if(len(s) < size_block or first_block):
                break
    return key.hexdigest().upper()


def find_duplicates(files):
    """Find duplicates among a set of files.
    The implementation uses two types of hashes:
    - a small and fast one on the first block of the file,
    - and, in case of collision, a complete hash of the whole file. The complete hash
      is not computed twice.
    It prints the files that seem to have the same content
    (according to the hash method) at the end.
    """
    print 'Analyzing', len(files), 'files'

    # this dictionary will receive small hashes
    d = {}
    # this dictionary will receive full hashes. It is filled
    # only in case of collision on the small hash (contains at least two
    # elements)
    duplicates = {}

    for f in files:
        # small hash to be fast
        check = Checksum(f, first_block=True, check_type='sha1')

        if(not d.has_key(check)):
            # d[check] is a list of files that have the same small hash
            d[check] = [(f, None)]
        else:
            l = d[check]
            l.append((f, None))

            for index, (ff, checkfull) in enumerate(l):
                if(checkfull is None):
                    # computes the full hash in case of collision
                    checkfull = Checksum(ff, first_block=False)
                    l[index] = (ff, checkfull)

                    # for each new full hash computed, check if there is
                    # a collision in the duplicate dictionary.
                    if(not duplicates.has_key(checkfull)):
                        duplicates[checkfull] = [ff]
                    else:
                        duplicates[checkfull].append(ff)

    # prints the detected duplicates
    if(len(duplicates) != 0):
        print
        print "The following files have the same sha512 hash"

        for h, lf in duplicates.items():
            if(len(lf) == 1):
                continue
            print 'Hash value', h
            for f in lf:
                print '\t', f.encode('unicode_escape') if \
                    type(f) is types.UnicodeType else f
    return duplicates
The find_duplicates function takes a list of files. This way, it is also possible to compare two directories (for instance, to better synchronize their content.) An example of function creating a list of files, with specified extension, and avoiding entering in some directories, is below:
def getFiles(_path, extensions=['.png'],
             subdirs=False, avoid_directories=None):
    """Returns the list of files in the path '_path',
    of extension in 'extensions'. 'subdirs' indicates if
    the search should also be performed in the subdirectories.
    If extensions = [] or None, all files are returned.
    avoid_directories: if set, do not parse subdirectories that
    match any element of avoid_directories."""

    l = []
    extensions = [p.lower() for p in extensions] if not extensions is None \
        else None
    for root, dirs, files in os.walk(_path, topdown=True):

        for name in files:
            if(extensions is None or len(extensions) == 0 or \
               os.path.splitext(name)[1].lower() in extensions):
                l.append(os.path.join(root, name))

        if(not subdirs):
            while(len(dirs) > 0):
                dirs.pop()
        elif(not avoid_directories is None):
            for d in avoid_directories:
                if(d in dirs):
                    dirs.remove(d)
    return l
This method is convenient for not parsing .svn paths, for example, which would surely trigger colliding files in find_duplicates.
Feedback is welcome.
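A usage sketch combining the two helpers above (the directory, extensions, and excluded folders are made up for illustration):
photos = getFiles('/home/me/photos', extensions=['.jpg', '.png'],
                  subdirs=True, avoid_directories=['.svn'])
find_duplicates(photos)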
@IanLee1521 has a nice solution here. It is very efficient because it checks for duplicates based on the file size first.
#! /usr/bin/env python

# Originally taken from:
# http://www.pythoncentral.io/finding-duplicate-files-with-python/
# Original Author: Andres Torres

# Adapted to only compute the md5sum of files with the same size

import argparse
import os
import sys
import hashlib


def find_duplicates(folders):
    """
    Takes in an iterable of folders and prints & returns the duplicate files
    """
    dup_size = {}
    for i in folders:
        # Iterate the folders given
        if os.path.exists(i):
            # Find the duplicated files and append them to dup_size
            join_dicts(dup_size, find_duplicate_size(i))
        else:
            print('%s is not a valid path, please verify' % i)
            return {}

    print('Comparing files with the same size...')
    dups = {}
    for dup_list in dup_size.values():
        if len(dup_list) > 1:
            join_dicts(dups, find_duplicate_hash(dup_list))
    print_results(dups)
    return dups


def find_duplicate_size(parent_dir):
    # Dups in format {hash:[names]}
    dups = {}
    for dirName, subdirs, fileList in os.walk(parent_dir):
        print('Scanning %s...' % dirName)
        for filename in fileList:
            # Get the path to the file
            path = os.path.join(dirName, filename)
            # Check to make sure the path is valid.
            if not os.path.exists(path):
                continue
            # Calculate sizes
            file_size = os.path.getsize(path)
            # Add or append the file path
            if file_size in dups:
                dups[file_size].append(path)
            else:
                dups[file_size] = [path]
    return dups


def find_duplicate_hash(file_list):
    print('Comparing: ')
    for filename in file_list:
        print('    {}'.format(filename))
    dups = {}
    for path in file_list:
        file_hash = hashfile(path)
        if file_hash in dups:
            dups[file_hash].append(path)
        else:
            dups[file_hash] = [path]
    return dups


# Joins two dictionaries
def join_dicts(dict1, dict2):
    for key in dict2.keys():
        if key in dict1:
            dict1[key] = dict1[key] + dict2[key]
        else:
            dict1[key] = dict2[key]


def hashfile(path, blocksize=65536):
    afile = open(path, 'rb')
    hasher = hashlib.md5()
    buf = afile.read(blocksize)
    while len(buf) > 0:
        hasher.update(buf)
        buf = afile.read(blocksize)
    afile.close()
    return hasher.hexdigest()


def print_results(dict1):
    results = list(filter(lambda x: len(x) > 1, dict1.values()))
    if len(results) > 0:
        print('Duplicates Found:')
        print(
            'The following files are identical. The name could differ, but the'
            ' content is identical'
        )
        print('___________________')
        for result in results:
            for subresult in result:
                print('\t\t%s' % subresult)
            print('___________________')
    else:
        print('No duplicate files found.')


def main():
    parser = argparse.ArgumentParser(description='Find duplicate files')
    parser.add_argument(
        'folders', metavar='dir', type=str, nargs='+',
        help='A directory to parse for duplicates',
    )
    args = parser.parse_args()

    find_duplicates(args.folders)


if __name__ == '__main__':
    sys.exit(main())
import hashlib
import os
import sys
from sets import Set


def read_chunk(fobj, chunk_size=2048):
    """ Files can be huge so read them in chunks of bytes. """
    while True:
        chunk = fobj.read(chunk_size)
        if not chunk:
            return
        yield chunk


def remove_duplicates(dir, hashfun=hashlib.sha512):
    unique = Set()
    for filename in os.listdir(dir):
        filepath = os.path.join(dir, filename)
        if os.path.isfile(filepath):
            hashobj = hashfun()
            for chunk in read_chunk(open(filepath, 'rb')):
                hashobj.update(chunk)
            # the size of the hashobj is constant
            # print "hashfun: ", hashfun.__sizeof__()
            hashfile = hashobj.hexdigest()
            if hashfile not in unique:
                unique.add(hashfile)
            else:
                os.remove(filepath)


try:
    hashfun = hashlib.sha256
    remove_duplicates(sys.argv[1], hashfun)
except IndexError:
    print """Please pass a path to a directory with
          duplicate files as a parameter to the script."""
Python has a standard library module called filecmp for comparing files and directories.
It checks the file size and compares content in 8k chunks.
It works on binary files.
It does not hash.
Python docs for filecmp
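For a pairwise check this can be a single call; a short example (file names are placeholders):
import filecmp

# shallow=False forces a byte-by-byte comparison instead of only comparing os.stat() signatures
if filecmp.cmp('song1.mp3', 'song_copy.mp3', shallow=False):
    print('identical contents')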
In order to be safe (removing them automatically can be dangerous if something goes wrong!), here is what I use, based on @zalew's answer.
Please also note that the md5 sum code is slightly different from @zalew's, because his code generated too many wrong duplicate files (that's why I said removing them automatically is dangerous!).
import hashlib, os

unique = dict()
for filename in os.listdir('.'):
    if os.path.isfile(filename):
        filehash = hashlib.md5(open(filename, 'rb').read()).hexdigest()
        if filehash not in unique:
            unique[filehash] = filename
        else:
            print filename + ' is a duplicate of ' + unique[filehash]
I have found 100% working code for recursively removing duplicate files inside a folder. Just replace the folder name in the clean method with your folder name.
import time
import os
import shutil
from hashlib import sha256


class Duplython:
    def __init__(self):
        self.home_dir = os.getcwd()
        self.File_hashes = []
        self.Cleaned_dirs = []
        self.Total_bytes_saved = 0
        self.block_size = 65536
        self.count_cleaned = 0

    def welcome(self) -> None:
        print('******************************************************************')
        print('**************** DUPLYTHON ****************************')
        print('********************************************************************\n\n')
        print('---------------- WELCOME ----------------------------')
        time.sleep(3)
        print('\nCleaning .................')
        return None

    def generate_hash(self, Filename: str) -> str:
        Filehash = sha256()
        try:
            with open(Filename, 'rb') as File:
                fileblock = File.read(self.block_size)
                while len(fileblock) > 0:
                    Filehash.update(fileblock)
                    fileblock = File.read(self.block_size)
                Filehash = Filehash.hexdigest()
            return Filehash
        except:
            return False

    def clean(self) -> None:
        all_dirs = [path[0] for path in os.walk('E:\\songs')]
        for path in all_dirs:
            os.chdir(path)
            All_Files = [file for file in os.listdir() if os.path.isfile(file)]
            for file in All_Files:
                filehash = self.generate_hash(file)
                if not filehash in self.File_hashes:
                    if filehash:
                        self.File_hashes.append(filehash)
                        # print(file)
                else:
                    byte_saved = os.path.getsize(file)
                    self.count_cleaned += 1
                    self.Total_bytes_saved += byte_saved
                    os.remove(file)
                    filename = file.split('/')[-1]
                    print(filename, '.. cleaned ')
            os.chdir(self.home_dir)

    def cleaning_summary(self) -> None:
        mb_saved = self.Total_bytes_saved / 1048576
        mb_saved = round(mb_saved, 2)
        print('\n\n--------------FINISHED CLEANING ------------')
        print('File cleaned : ', self.count_cleaned)
        print('Total Space saved : ', mb_saved, 'MB')
        print('-----------------------------------------------')

    def main(self) -> None:
        self.welcome()
        self.clean()
        self.cleaning_summary()


#
# if __name__ == '__main__':
#     App = Duplython()
#     App.main()
def dedupe_bing_images():
    App = Duplython()
    App.main()
    return True


dedupe_bing_images()
