How to concatenate a newly added file to pandas dataframe?

How to concatenate a newly added file to pandas dataframe? - python

I am trying to write a script that grabs each newly added csv file from a folder and appends it to one big file. Basically, I want all of the csv files added to a particular folder to be stored in one resulting csv file. The code below generates the list of files, and I select the newly added file from it:
def check_dir(fh,start_path='/Users/.../Desktop/files',new_cb=None,changed_cb=None):
    """Walk ``start_path`` and report files that are new or changed in size.

    Args:
        fh: Dict mapping a bare file name to the size recorded on the
            previous scan; updated in place. The key is the file name only,
            so equally named files in different sub-folders share an entry.
        start_path: Root folder that is scanned recursively.
        new_cb: Optional callable, called with the full path of each file
            whose name is not yet in ``fh``.
        changed_cb: Optional callable, called with the full path of each
            known file whose size differs from the recorded one.

    Returns:
        Total size in bytes of all regular (non-symlink) files found.
    """
    total_size = 0
    for dirpath, dirnames, filenames in os.walk(start_path):
        for f in filenames:
            fp = os.path.join(dirpath, f)
            if os.path.islink(fp):
                continue
            try:
                fs = os.path.getsize(fp)
            except OSError:
                # The file disappeared (or became unreadable) between the
                # directory listing and the stat call; ignore it this round.
                continue
            total_size += fs
            if f not in fh:
                # new file
                if new_cb:
                    new_cb(fp)
            elif fh[f] != fs:
                # known file whose size changed
                if changed_cb:
                    changed_cb(fp)
            fh[f] = fs
    return total_size
def new_file(fp):
    """Announce on stdout that the file at ``fp`` was seen for the first time."""
    print("New File {0}!".format(fp))
def changed_file(fp):
    """Announce on stdout that the size of the file at ``fp`` changed."""
    print("File {0} changed!".format(fp))
if __name__ == '__main__':
    # Poll the folder forever, remembering every file name and size seen.
    file_history={}
    total = 0
    while(True):
        nt = check_dir(file_history,'/Users/.../Desktop/files',new_file,changed_file)
        if total and nt != total:
            print("Total size changed from {0} to {1}".format(total,nt))
        total = nt
        time.sleep(200)
        print("File list:\n{0}".format(file_history))
        # Dicts keep insertion order, so the last key is the most recently
        # discovered file. Guard against an empty folder, where indexing
        # [-1] would raise IndexError.
        if file_history:
            print(list(file_history)[-1])
I don't really know how to create this empty pandas data frame to which this latest added file will be added on a regular basis (that's why I have a time.sleep there). In the end I want to have this big csv file with all the files added to it.
Please, help :(
P.S. I am new to Python, so please don't judge if it is super simple..

Are you going to be using Pandas to process the data in the csv or only to concatenate the files?
If you simply want to append each csv file to the big one, then why not use Python's plain file I/O, for speed and simplicity? That assumes all of the csv files share the same format.
I have updated the new_file method to append to the big csv using io. I have added an append_pandas function which is not used but should help you if you must use pandas to do the job. I haven't tested the pandas function, there are more things to consider like the format of the csv files. Check out the documentation for more details.
import os
import time
def check_dir(fh,start_path='/Users/.../Desktop/files',new_cb=None,changed_cb=None,**kwargs):
    """Walk ``start_path`` and report files that are new or changed in size.

    Args:
        fh: Dict mapping a bare file name to the size recorded on the
            previous scan; updated in place. The key is the file name only,
            so equally named files in different sub-folders share an entry.
        start_path: Root folder that is scanned recursively.
        new_cb: Optional callable, called as ``new_cb(path, **kwargs)`` for
            each file whose name is not yet in ``fh``.
        changed_cb: Optional callable, called as
            ``changed_cb(path, **kwargs)`` for each known file whose size
            differs from the recorded one.
        **kwargs: Extra keyword arguments forwarded unchanged to both
            callbacks.

    Returns:
        Total size in bytes of all regular (non-symlink) files found.
    """
    total_size = 0
    for dirpath, dirnames, filenames in os.walk(start_path):
        for f in filenames:
            fp = os.path.join(dirpath, f)
            if os.path.islink(fp):
                continue
            try:
                fs = os.path.getsize(fp)
            except OSError:
                # The file disappeared (or became unreadable) between the
                # directory listing and the stat call; ignore it this round.
                continue
            total_size += fs
            if f not in fh:
                # new file
                if new_cb:
                    new_cb(fp, **kwargs)
            elif fh[f] != fs:
                # known file whose size changed
                if changed_cb:
                    changed_cb(fp, **kwargs)
            fh[f] = fs
    return total_size
def is_csv(f):
    """Return True when the path ``f`` has a ``.csv`` extension.

    The check is case-insensitive and looks at the end of the name only, so
    a folder or file that merely contains the letters "csv" does not match.
    """
    # you can add more to check here
    return f.lower().endswith('.csv')
def append_csv(s,d,skip_header=1):
    """Append the rows of csv file ``s`` to csv file ``d``.

    Args:
        s: Path of the source csv file.
        d: Path of the destination csv file; opened in append mode, so it is
            created when missing.
        skip_header: Number of leading lines of ``s`` that are not copied.

    The destination always ends with a newline after rows were copied, so
    the next appended file starts on a fresh line. An empty or header-only
    source leaves the destination untouched.
    """
    last_written = None
    with open(s,'r') as readcsv, open(d,'a') as appendcsv:
        for line in readcsv:
            if skip_header > 0:
                skip_header -= 1
                continue
            appendcsv.write(line)
            last_written = line
        # Only terminate the file when a row was actually copied and that
        # row came without its own line ending.
        if last_written is not None and not last_written.endswith("\n"):
            appendcsv.write("\n")
def append_pandas(s,d):
    """Append the rows of csv file ``s`` to csv file ``d`` using pandas.

    Both files are parsed with ``pandas.read_csv``, concatenated (existing
    rows first) and the result is written back to ``d`` without the index
    column.

    Args:
        s: Path of the csv file holding the new rows.
        d: Path of the existing aggregate csv file; it is overwritten.
    """
    # Local import: the surrounding script only imports os and time.
    import pandas

    new_rows = pandas.read_csv(s)
    existing = pandas.read_csv(d)
    # DataFrame.append was removed in pandas 2.0; concat is the replacement.
    combined = pandas.concat([existing, new_rows], ignore_index=True)
    combined.to_csv(d, index=False)
def new_file(fp, **kwargs):
    """Callback for a newly discovered file: append csv files to the big csv.

    Args:
        fp: Full path of the new file.
        **kwargs: ``append_to_csv`` is the path of the aggregate csv
            (default ``'/default/path/to/big.csv'``); ``skip_header`` is the
            number of leading lines to drop from ``fp`` (default 1).
    """
    if not is_csv(fp):
        return
    bcsv = kwargs.get('append_to_csv','/default/path/to/big.csv')
    skip = kwargs.get('skip_header',1)
    if os.path.abspath(fp) == os.path.abspath(bcsv):
        # The aggregate file itself sits in the watched folder; appending it
        # to itself would duplicate every row.
        return
    print("Appending {0}!".format(fp))
    append_csv(fp,bcsv,skip)
def changed_file(fp, **kwargs):
    """Callback for a file whose size changed: report it on stdout.

    ``kwargs`` are accepted so the callback has the same call shape as the
    new-file callback; they are not used.
    """
    print("File {0} changed!".format(fp))
if __name__ == '__main__':
    # Poll /tmp/test/ every 10 seconds and append each new csv to /tmp/big.csv.
    file_history={}
    total = 0
    while(True):
        nt = check_dir(file_history,'/tmp/test/',new_file,changed_file, append_to_csv ='/tmp/big.csv', skip_header = 1)
        # The scan result is bound to ``nt``; the original compared and
        # printed an undefined name ``ns``, which raised NameError.
        if total and nt != total:
            print("Total size changed from {0} to {1}".format(total,nt))
        total = nt
        time.sleep(10)
        print("File list:\n{0}".format(file_history))

I think that pandas.concat() is what you are looking for

Related

Compare folder size and run script if the size changed

I am trying to write a code which will carry out something with the newly added file to the folder. So, the way I see it is to calculate the folder size, compare it with the one calculated ±10 mins ago, and then to do something with the newly added file if the size of the folder did change.
def get_size(start_path='/Users/.../Desktop/files'):
    """Return the total size in bytes of all regular files under ``start_path``.

    Symbolic links are skipped.
    """
    total_size = 0
    for dirpath, dirnames, filenames in os.walk(start_path):
        for f in filenames:
            fp = os.path.join(dirpath, f)
            # skip if it is symbolic link
            if not os.path.islink(fp):
                total_size += os.path.getsize(fp)
    return total_size


# The function is defined once, outside the loop, instead of being
# re-created on every iteration.
while (True):
    print(get_size(), 'bytes')
    time.sleep(10)
The code above calculates the size of the folder every 10 seconds. I don't know how to compare it to the previous size though :(
Please, help..

Tracking the total size of the directory is limiting. How about you keep a list of files and their sizes? That way you can act on changed files and new files. Using a dictionary here as a basic example, you can really make it as complicated as you wish, tracking creation, modification dates etc. If you don't want the complexity I have retained tracking of total size, however you still need to track which file(s) have changed.
import os
import time
def check_dir(fh,start_path='/tmp',new_cb=None,changed_cb=None):
    """Scan ``start_path`` recursively, tracking file sizes between calls.

    Args:
        fh: History dict (file name -> last seen size), updated in place.
            Keys are bare file names, so files with the same name in
            different sub-folders overwrite each other's entry.
        start_path: Folder to scan.
        new_cb: Optional callable receiving the full path of each file not
            yet present in ``fh``.
        changed_cb: Optional callable receiving the full path of each file
            whose size no longer matches ``fh``.

    Returns:
        Sum of the sizes, in bytes, of every regular (non-symlink) file.
    """
    total_size = 0
    for dirpath, dirnames, filenames in os.walk(start_path):
        for f in filenames:
            fp = os.path.join(dirpath, f)
            if os.path.islink(fp):
                continue
            try:
                fs = os.path.getsize(fp)
            except OSError:
                # Deleted or unreadable since the listing was taken.
                continue
            total_size += fs
            if f not in fh:
                # new file
                if new_cb:
                    new_cb(fp)
            elif fh[f] != fs:
                # size differs from the previous scan
                if changed_cb:
                    changed_cb(fp)
            fh[f] = fs
    return total_size
def new_file(fp):
    """Print a notice that ``fp`` is a file not seen on earlier scans."""
    print("New File {0}!".format(fp))
def changed_file(fp):
    """Print a notice that the size of ``fp`` differs from the last scan."""
    print("File {0} changed!".format(fp))
if __name__ == '__main__':
    # Re-scan /tmp/test every 10 seconds; check_dir fires the callbacks for
    # new and resized files and returns the folder's total size.
    file_history={}
    total = 0
    while(True):
        nt = check_dir(file_history,'/tmp/test',new_file,changed_file)
        # ``total`` is 0 on the first pass, which suppresses the message
        # until a previous measurement exists.
        if total and nt != total:
            print("Total size changed from {0} to {1}".format(total,nt))
        total = nt
        time.sleep(10)
        print("File list:\n{0}".format(file_history))

This is a simple problem that can be reduced to the following minimal reproducible example:
while True:
    a = input() # How can I check if this is different from the previous input?
Create a variable that stores the previous size, like this:
old = get_size()
while True:
    new = get_size()
    if new != old:
        # Something changed
        old = new # Rebinding is safe: ints are immutable, so `old` keeps this value
I would add a suggestion for you:
def get_size():
    ...


while True:
    ...
This will make you avoid a waste of time and efficiency, since otherwise the get_size function gets redefined each time the while loop executes (every 10 seconds!).

You can try something like that:
before = 0


def get_size(start_path='.'):
    """Measure the folder, report the change since the last call, return the size.

    The previous measurement is kept in the module-level ``before``, which
    this function reads and then overwrites.

    Args:
        start_path: Folder whose regular (non-symlink) files are summed.

    Returns:
        Total size in bytes.
    """
    # use variable defined outside loop
    global before
    total_size = 0
    for dirpath, dirnames, filenames in os.walk(start_path):
        for f in filenames:
            fp = os.path.join(dirpath, f)
            # skip if it is symbolic link
            if not os.path.islink(fp):
                total_size += os.path.getsize(fp)
    # Your logic
    if total_size > before:
        print("New: "+str(total_size - before))
    elif total_size < before:
        # A shrinking folder is a change too; it used to be reported as
        # "No changes".
        print("Removed: "+str(before - total_size))
    else:
        print("No changes")
    before = total_size
    return total_size


# Defined once above instead of being re-created on every loop iteration.
while (True):
    print(get_size(), 'bytes')
    time.sleep(10)

Script that should remove certain txt files from a directory does not remove the files

I am trying to write a python script that iterates over all the txt files in a directory and deletes those that have fewer words than a defined limit. The current script does not delete the files. Below is what I tried:
import os
wordLimit = 1000
def articleWordCount(filename):
    """Return the number of whitespace-separated words in the file ``filename``.

    The file is opened read-only. The previous ``'w+'`` mode truncated the
    file on open, which destroyed its content and made every count 0.
    """
    with open(filename, 'r') as f:
        return sum(len(line.split()) for line in f)
def articlesRemoval():
    """Delete every ``.txt`` file in ``data`` that has fewer than ``wordLimit`` words.

    Paths of the files that are kept are printed, followed by the number of
    removed files.
    """
    directory = 'data'
    removedArticles = 0
    for filename in os.listdir(directory):
        if not filename.endswith(".txt"):
            continue
        # os.listdir returns bare names; join with the folder so the file is
        # found regardless of the current working directory.
        path = os.path.join(directory, filename)
        if articleWordCount(path) < wordLimit:
            removedArticles += 1
            os.remove(path)
        else:
            print(path)
    print(removedArticles)
articlesRemoval()

You should open the file in read mode with "r" (or "r+" if you also need to write to it). You are opening it with "w+", which truncates the file as soon as it is opened, so the function articleWordCount always returns 0.

os.listdir() doesn't return paths, only filenames, so the files that you are trying to remove do not exist... I am assuming that data is in the directory where you are starting the script and that it does find the files you want. Change os.remove(filename) to os.remove(os.path.join(directory, filename)):
import os
wordLimit = 1000
def articleWordCount(filename):
    """Count the whitespace-separated words stored in ``filename``.

    Opens the file for reading only; ``'w+'`` would empty the file before a
    single line could be read.
    """
    number = 0
    with open(filename, 'r') as f:
        for line in f:
            number += len(line.split())
    return number
def articlesRemoval():
    """Remove the ``.txt`` files in ``data`` that contain fewer than ``wordLimit`` words.

    Prints the path of each file that stays and finally the number of files
    that were removed.
    """
    directory = 'data'
    removedArticles = 0
    for filename in os.listdir(directory):
        if not filename.endswith(".txt"):
            continue
        # The same joined path is used for counting and for removing;
        # counting with the bare name looked in the wrong directory.
        path = os.path.join(directory, filename)
        if articleWordCount(path) < wordLimit:
            removedArticles += 1
            os.remove(path)
        else:
            print(path)
    print(removedArticles)
articlesRemoval()

Sum of values in a function called from thread

I have few text files containing some numbers in each line. I read the file and displayed total of each number in the function.
from Queue import Queue
from threading import Thread
import os
enclosure_queue = Queue()
list=[]
def getAllFiles():
    """Collect the path of every ``.txt`` file under ``C:/Users/test``.

    The paths are appended to the module-level ``list`` variable (which
    shadows the built-in of the same name); nothing is returned.
    """
    for root, dirs, files in os.walk("C:/Users/test"):
        for file in files:
            if file.endswith(".txt"):
                file_path= os.path.join(root, file)
                list.append(file_path)
def calc(i, q):
    """Worker loop: take file paths from ``q`` and print the sum of their lines.

    Args:
        i: Worker index (not used by the loop itself).
        q: Queue of file paths; each file holds one integer per line.

    The loop never returns, so run it in a daemon thread.
    """
    while True:
        file = q.get()
        try:
            add = 0
            with open(file, 'r') as fileData:
                for line in fileData:
                    add = add + int(line)
            print(str(add) + '\n')
        finally:
            # Always acknowledge the item; otherwise a bad file would leave
            # q.join() waiting forever.
            q.task_done()
# Collect the files, start one daemon worker per file, then queue every file
# and wait until all of them have been processed.
getAllFiles()
num_fetch_threads = len(list)
for i in range(num_fetch_threads):
    worker = Thread(target=calc, args=(i, enclosure_queue,))
    # The ``daemon`` attribute replaces the deprecated setDaemon() call.
    worker.daemon = True
    worker.start()
for ind_file in list:
    enclosure_queue.put(ind_file)
enclosure_queue.join()
It displays the sum of data in lines of individual file, but I need to add up the results.
For example if the calc function's add has 300 , 200 and 500 , I want final result as 1000 . I thought of adding each result to a list and then splitting it in another function and adding them. Is there any better solution ?

No need to use Queue here. Use multiprocessing.Pool.map, and change your calc method accordingly. Also threading.Thread does not return results, whereas multiprocessing.Pool.map returns results.
import multiprocessing
import os
def getAllFiles(root_dir="C:/Users/test"):
    """Return the paths of all ``.txt`` files found under ``root_dir``.

    Args:
        root_dir: Folder searched recursively; defaults to the folder the
            original script was hard-wired to.

    Returns:
        List of file paths, in the order ``os.walk`` yields them.
    """
    my_files = []
    for root, dirs, files in os.walk(root_dir):
        for file in files:
            if file.endswith(".txt"):
                my_files.append(os.path.join(root, file))
    return my_files
def calc(file):
    """Return the sum of the integers stored one per line in ``file``.

    Blank lines (for example a trailing empty line) are ignored; any other
    non-numeric line raises ``ValueError``.
    """
    with open(file, 'r') as f:
        return sum(int(line) for line in f if line.strip())
if __name__ == '__main__':
    my_files = getAllFiles()
    if my_files:
        # More processes than CPU cores only adds overhead, and a pool of
        # size 0 (no files found) raises ValueError.
        num_fetch_threads = min(len(my_files), multiprocessing.cpu_count())
        pool = multiprocessing.Pool(processes=num_fetch_threads)
        try:
            results = pool.map(calc, my_files)
        finally:
            pool.close()
            pool.join()
    else:
        results = []
    result = sum(results)
    print(result)

How do I fix this file_tracker that reads/writes using JSON dictionaries?

I am trying to write a script that tracks for changes made in directories/files set to multiple file paths created by an installer. I found Thomas Sileo's DirTools project on git, modified it, but am now running into some issues when writing/reading from JSON:
1) First, I believe that I am writing to JSON incorrectly and am finding that my create_state() function is only writing the last path I need.
2) If I get it working, I am unable to read/parse the file like I was before. I usually get ValueError: Extra data errors
Code below:
import os
import json
import getpass

files = []
subdirs = []
USER = getpass.getuser()
# Root folders whose content is tracked.
pathMac = ['/Applications/',
           '/Users/' + USER + '/Documents/']
def create_dir_index(path):
    """Index every file and sub-folder below ``path``.

    Args:
        path: Root folder to walk.

    Returns:
        Dict with two keys, ``files`` and ``subdirs``, each a sorted list of
        paths relative to ``path``.
    """
    files = []
    subdirs = []
    for root, dirs, filenames in os.walk(path):
        for subdir in dirs:
            subdirs.append(os.path.relpath(os.path.join(root, subdir), path))
        for f in filenames:
            files.append(os.path.relpath(os.path.join(root, f), path))
    # Sorted so that two runs over an unchanged tree produce identical JSON.
    return dict(files=sorted(files), subdirs=sorted(subdirs))
def _index_paths(paths):
    """Merge the indexes of all roots in ``paths`` into a single dict.

    Each entry is prefixed with its root so that equal relative paths under
    different roots stay distinct.
    """
    merged = {'files': [], 'subdirs': []}
    for root in paths:
        index = create_dir_index(root)
        for key in merged:
            merged[key].extend(os.path.join(root, rel) for rel in index[key])
    return merged


def _load_state(path):
    """Read one JSON state file written by ``create_state``."""
    with open(path, "r") as in_file:
        return json.load(in_file)


def create_state(paths=None, out_path="Manifest.json"):
    """Write the combined index of all tracked roots to one JSON file.

    Args:
        paths: Roots to index; defaults to the module-level ``pathMac``.
        out_path: File that receives the JSON document.

    The file is opened once and receives exactly one JSON document. Opening
    it inside the per-root loop kept only the last root, and appending
    several documents is what produces ``ValueError: Extra data`` on load.
    """
    state = _index_paths(pathMac if paths is None else paths)
    with open(out_path, "w") as out_file:
        json.dump(state, out_file)


def compare_states(dir_base, dir_cmp):
    '''
    Return a comparison of two state dicts.

    Entries only in ``dir_cmp`` are reported as deleted, entries only in
    ``dir_base`` as created, so pass the current state as ``dir_base`` and
    the manifest as ``dir_cmp``. All result lists are sorted.
    '''
    base_files, cmp_files = set(dir_base['files']), set(dir_cmp['files'])
    base_dirs, cmp_dirs = set(dir_base['subdirs']), set(dir_cmp['subdirs'])
    data = {}
    data['deleted'] = sorted(cmp_files - base_files)
    data['created'] = sorted(base_files - cmp_files)
    data['deleted_dirs'] = sorted(cmp_dirs - base_dirs)
    data['created_dirs'] = sorted(base_dirs - cmp_dirs)
    return data


if __name__ == '__main__':
    try:
        ask = raw_input  # Python 2
    except NameError:
        ask = input  # Python 3
    response = ask("Would you like to Compare or Create? ")
    if response == "Create":
        # CREATE MANIFEST json file
        create_state()
        print("Manifest file created.")
    elif response == "Compare":
        # create the CURRENT state of all indexes in pathMac and write to json file
        create_state(out_path="CurrentState.json")
        # Open and Load the contents from the file into dictionaries
        manifest = _load_state("Manifest.json")
        current = _load_state("CurrentState.json")
        print(compare_states(current, manifest))

Finding duplicate files and removing them

I am writing a Python program to find and remove duplicate files from a folder.
I have multiple copies of mp3 files, and some other files. I am using the SHA-1 algorithm.
How can I find these duplicate files and remove them?

Fastest algorithm - 100x performance increase compared to the accepted answer (really :))
The approaches in the other solutions are very cool, but they forget about an important property of duplicate files - they have the same file size. Calculating the expensive hash only on files with the same size will save tremendous amount of CPU; performance comparisons at the end, here's the explanation.
Iterating on the solid answers given by #nosklo and borrowing the idea of #Raffi to have a fast hash of just the beginning of each file, and calculating the full one only on collisions in the fast hash, here are the steps:
Buildup a hash table of the files, where the filesize is the key.
For files with the same size, create a hash table with the hash of their first 1024 bytes; non-colliding elements are unique
For files with the same hash on the first 1k bytes, calculate the hash on the full contents - files with matching ones are NOT unique.
The code:
#!/usr/bin/env python3
from collections import defaultdict
import hashlib
import os
import sys
def chunk_reader(fobj, chunk_size=1024):
    """Generator that reads a file in chunks of bytes.

    Yields successive ``fobj.read(chunk_size)`` results and stops at the
    first empty read.
    """
    while True:
        chunk = fobj.read(chunk_size)
        if not chunk:
            return
        yield chunk
def get_hash(filename, first_chunk_only=False, hash=hashlib.sha1):
    """Return the raw digest of ``filename``.

    Args:
        filename: Path of the file to hash.
        first_chunk_only: When true, only the first 1024 bytes are hashed.
        hash: Hash constructor, e.g. ``hashlib.sha1``.

    Returns:
        The digest as bytes.
    """
    hashobj = hash()
    # The with-block closes the file even when reading raises.
    with open(filename, 'rb') as file_object:
        if first_chunk_only:
            hashobj.update(file_object.read(1024))
        else:
            for chunk in iter(lambda: file_object.read(1024), b''):
                hashobj.update(chunk)
    return hashobj.digest()
def check_for_duplicates(paths, hash=hashlib.sha1):
    """Find files with identical content below the given ``paths``.

    Candidates are narrowed in three passes: same size, then same hash of
    the first 1024 bytes, then same hash of the full content.

    Args:
        paths: Iterable of folders to scan recursively.
        hash: Hash constructor handed to ``get_hash`` for both hash passes.

    Returns:
        List of ``(duplicate_path, first_seen_path)`` tuples. Each pair is
        also printed as it is found.
    """
    files_by_size = defaultdict(list)  # dict of size_in_bytes: [full_path_to_file1, full_path_to_file2, ]
    files_by_small_hash = defaultdict(list)  # dict of (hash1k, size_in_bytes): [full_path_to_file1, full_path_to_file2, ]
    first_by_full_hash = {}  # dict of full_file_hash: full_path_to_file_string
    seen_real_paths = set()
    duplicates = []

    for path in paths:
        for dirpath, dirnames, filenames in os.walk(path):
            # get all files that have the same size - they are the collision candidates
            for filename in filenames:
                full_path = os.path.join(dirpath, filename)
                try:
                    # if the target is a symlink (soft one), this will
                    # dereference it - change the value to the actual target file
                    full_path = os.path.realpath(full_path)
                    file_size = os.path.getsize(full_path)
                except OSError:
                    # not accessible (permissions, etc) - pass on
                    continue
                if full_path in seen_real_paths:
                    # Same physical file reached again (symlink or
                    # overlapping roots); it is not a duplicate of itself.
                    continue
                seen_real_paths.add(full_path)
                files_by_size[file_size].append(full_path)

    # For all files with the same file size, get their hash on the 1st 1024 bytes only
    for size_in_bytes, files in files_by_size.items():
        if len(files) < 2:
            continue  # this file size is unique, no need to spend CPU cycles on it
        for filename in files:
            try:
                small_hash = get_hash(filename, first_chunk_only=True, hash=hash)
            except OSError:
                # the file access might've changed till the exec point got here
                continue
            # the key is the hash on the first 1024 bytes plus the size - to
            # avoid collisions on equal hashes in the first part of the file
            files_by_small_hash[(small_hash, size_in_bytes)].append(filename)

    # For all files with the hash on the 1st 1024 bytes, get their hash on the full file - collisions will be duplicates
    for files_list in files_by_small_hash.values():
        if len(files_list) < 2:
            continue  # this hash of first 1k file bytes is unique, no need to spend CPU cycles on it
        for filename in files_list:
            try:
                full_hash = get_hash(filename, first_chunk_only=False, hash=hash)
            except OSError:
                # the file access might've changed till the exec point got here
                continue
            duplicate = first_by_full_hash.get(full_hash)
            if duplicate is None:
                first_by_full_hash[full_hash] = filename
            else:
                print("Duplicate found: {} and {}".format(filename, duplicate))
                duplicates.append((filename, duplicate))
    return duplicates
if __name__ == "__main__":
    # Every command-line argument is treated as a folder to scan.
    if sys.argv[1:]:
        check_for_duplicates(sys.argv[1:])
    else:
        print("Please pass the paths to check as parameters to the script")
And, here's the fun part - performance comparisons.
Baseline -
a directory with 1047 files, 32 mp4, 1015 - jpg, total size - 5445.998 MiB - i.e. my phone's camera auto upload directory :)
small (but fully functional) processor - 1600 BogoMIPS, 1.2 GHz 32L1 + 256L2 Kbs cache, /proc/cpuinfo:
Processor : Feroceon 88FR131 rev 1 (v5l)
BogoMIPS : 1599.07
(i.e. my low-end NAS :), running Python 2.7.11.
So, the output of #nosklo's very handy solution:
root#NAS:InstantUpload# time ~/scripts/checkDuplicates.py
Duplicate found: ./IMG_20151231_143053 (2).jpg and ./IMG_20151231_143053.jpg
Duplicate found: ./IMG_20151125_233019 (2).jpg and ./IMG_20151125_233019.jpg
Duplicate found: ./IMG_20160204_150311.jpg and ./IMG_20160204_150311 (2).jpg
Duplicate found: ./IMG_20160216_074620 (2).jpg and ./IMG_20160216_074620.jpg
real 5m44.198s
user 4m44.550s
sys 0m33.530s
And, here's the version with filter on size check, then small hashes, and finally full hash if collisions are found:
root#NAS:InstantUpload# time ~/scripts/checkDuplicatesSmallHash.py . "/i-data/51608399/photo/Todor phone"
Duplicate found: ./IMG_20160216_074620 (2).jpg and ./IMG_20160216_074620.jpg
Duplicate found: ./IMG_20160204_150311.jpg and ./IMG_20160204_150311 (2).jpg
Duplicate found: ./IMG_20151231_143053 (2).jpg and ./IMG_20151231_143053.jpg
Duplicate found: ./IMG_20151125_233019 (2).jpg and ./IMG_20151125_233019.jpg
real 0m1.398s
user 0m1.200s
sys 0m0.080s
Both versions were run 3 times each, to get the average of the time needed.
So v1 is (user+sys) 284s, the other - 2s; quite a diff, huh :)
With this increase, one could go to SHA512, or even fancier - the perf penalty will be mitigated by the less calculations needed.
Negatives:
More disk access than the other versions - every file is accessed once for size stats (that's cheap, but still is disk IO), and every duplicate is opened twice (for the small first 1k bytes hash, and for the full contents hash)
Will consume more memory due to storing the hash tables runtime

Recursive folders version:
This version uses the file size and a hash of the contents to find duplicates.
You can pass it multiple paths, it will scan all paths recursively and report all duplicates found.
import sys
import os
import hashlib
def chunk_reader(fobj, chunk_size=1024):
    """Generator that reads a file in chunks of bytes.

    Stops as soon as a read returns nothing.
    """
    while True:
        chunk = fobj.read(chunk_size)
        if not chunk:
            return
        yield chunk
def check_for_duplicates(paths, hash=hashlib.sha1):
    """Report files with identical size and content below ``paths``.

    Args:
        paths: Iterable of folders to scan recursively.
        hash: Hash constructor used for the content digest.

    Returns:
        List of ``(duplicate_path, first_seen_path)`` tuples; each pair is
        also printed when found.
    """
    hashes = {}
    duplicates = []
    for path in paths:
        for dirpath, dirnames, filenames in os.walk(path):
            for filename in filenames:
                full_path = os.path.join(dirpath, filename)
                hashobj = hash()
                # Close each file right after hashing; a large tree would
                # otherwise exhaust the open-file limit.
                with open(full_path, 'rb') as fobj:
                    for chunk in iter(lambda: fobj.read(1024), b''):
                        hashobj.update(chunk)
                file_id = (hashobj.digest(), os.path.getsize(full_path))
                duplicate = hashes.get(file_id, None)
                if duplicate:
                    print("Duplicate found: %s and %s" % (full_path, duplicate))
                    duplicates.append((full_path, duplicate))
                else:
                    hashes[file_id] = full_path
    return duplicates
# Every command-line argument is treated as a folder to scan.
if sys.argv[1:]:
    check_for_duplicates(sys.argv[1:])
else:
    print("Please pass the paths to check as parameters to the script")

def remove_duplicates(dir):
    """Delete files in ``dir`` whose content duplicates an earlier file.

    Files are visited in sorted name order, so the alphabetically first file
    of each group of identical files is the one that is kept. Sub-folders
    are not entered.

    Args:
        dir: Folder to clean.
    """
    # Local imports: the snippet has no import block of its own.
    import hashlib
    import os

    seen = set()
    for filename in sorted(os.listdir(dir)):
        # os.listdir yields bare names; without the join the checks below
        # would look in the current working directory instead of ``dir``.
        path = os.path.join(dir, filename)
        if not os.path.isfile(path):
            continue
        # MD5 is used only as a content fingerprint, not for security.
        digest = hashlib.md5()
        with open(path, 'rb') as handle:
            for chunk in iter(lambda: handle.read(65536), b''):
                digest.update(chunk)
        filehash = digest.hexdigest()
        if filehash in seen:
            os.remove(path)
        else:
            seen.add(filehash)
//edit:
For MP3 you may be also interested in this topic Detect duplicate MP3 files with different bitrates and/or different ID3 tags?

I wrote one in Python some time ago -- you're welcome to use it.
import sys
import os
import hashlib
def check_path(filepath, hashes, p = sys.stdout.write):
    """Record the SHA-1 of ``filepath`` and report it when already known.

    Args:
        filepath: File to hash.
        hashes: Dict mapping hex digest to the first path seen with that
            content; updated in place.
        p: Callable that receives the duplicate message.

    Returns:
        The path stored in ``hashes`` for this content, i.e. ``filepath``
        itself when the content is new, otherwise the earlier path.
    """
    digest = hashlib.sha1()
    with open(filepath, 'rb') as handle:
        for chunk in iter(lambda: handle.read(65536), b''):
            digest.update(chunk)
    key = digest.hexdigest()
    if key in hashes:
        p('DUPLICATE FILE\n'
          ' %s\n'
          'of %s\n' % (filepath, hashes[key]))
    return hashes.setdefault(key, filepath)


def scan(dirpath, hashes = None):
    """Run ``check_path`` on every file below ``dirpath``.

    Args:
        dirpath: Folder to walk recursively.
        hashes: Optional digest dict shared between calls; a fresh dict is
            used when omitted.

    Returns:
        One list per visited folder holding the ``check_path`` results of
        its files.
    """
    if hashes is None:
        hashes = {}
    return [
        [check_path(os.path.join(root, filename), hashes) for filename in files]
        for root, dirs, files in os.walk(dirpath)
    ]


if len(sys.argv) > 1:
    scan(sys.argv[1])

Faster algorithm
In case many files of 'big size' should be analyzed (images, mp3, pdf documents), it would be interesting/faster to have the following comparison algorithm:
a first fast hash is performed on the first N bytes of the file (say 1KB). This hash would say if files are different without doubt, but will not say if two files are exactly the same (accuracy of the hash, limited data read from disk)
a second, slower, hash, which is more accurate and performed on the whole content of the file, if a collision occurs in the first stage
Here is an implementation of this algorithm:
import hashlib
def Checksum(current_file_name, check_type = 'sha512', first_block = False):
    """Computes the hash for the given file. If first_block is True,
    only the first block of size size_block is hashed.

    Args:
        current_file_name: Path of the file to hash.
        check_type: One of ``'sha1'``, ``'md5'`` or ``'sha512'``.
        first_block: Hash only the first block when true.

    Returns:
        The hex digest in upper case.

    Raises:
        Exception: If ``check_type`` is not a supported method.
    """
    size_block = 1024 * 1024 # Block size in bytes (1 MiB)
    d = {'sha1' : hashlib.sha1, 'md5': hashlib.md5, 'sha512': hashlib.sha512}
    if check_type not in d:
        raise Exception("Unknown checksum method")
    key = d[check_type]()
    with open(current_file_name, 'rb') as f:
        while True:
            s = f.read(size_block)
            key.update(s)
            # A short read means end of file.
            if len(s) < size_block or first_block:
                break
    return key.hexdigest().upper()
def find_duplicates(files):
    """Find duplicates among a set of files.

    The implementation uses two types of hashes:
    - a small and fast one on the first block of the file,
    - and, in case of collision, a complete hash of the file. The complete
      hash is never computed twice for the same file.

    The groups of files that appear to have the same content (according to
    the hash method) are printed at the end.

    Args:
        files: Iterable of file paths.

    Returns:
        Dict mapping a full hash to the list of files with that hash. Only
        files whose small hash collided are included, so a list may hold a
        single entry.
    """
    print('Analyzing %d files' % len(files))
    # this dictionary will receive small hashes
    d = {}
    # this dictionary will receive full hashes. It is filled
    # only in case of collision on the small hash (contains at least two
    # elements)
    duplicates = {}
    for f in files:
        # small hash to be fast
        check = Checksum(f, first_block = True, check_type = 'sha1')
        if check not in d:
            # d[check] is a list of files that have the same small hash
            d[check] = [(f, None)]
            continue
        l = d[check]
        l.append((f, None))
        for index, (ff, checkfull) in enumerate(l):
            if checkfull is not None:
                continue
            # computes the full hash in case of collision
            checkfull = Checksum(ff, first_block = False)
            l[index] = (ff, checkfull)
            # for each new full hash computed, check if there is
            # a collision in the duplicate dictionary.
            duplicates.setdefault(checkfull, []).append(ff)
    # prints the detected duplicates
    if duplicates:
        print('')
        print("The following files have the same sha512 hash")
        for h, lf in duplicates.items():
            if len(lf) == 1:
                continue
            print('Hash value %s' % h)
            for f in lf:
                print('\t %s' % f)
    return duplicates
The find_duplicates function takes a list of files. This way, it is also possible to compare two directories (for instance, to better synchronize their content.) An example of function creating a list of files, with specified extension, and avoiding entering in some directories, is below:
def getFiles(_path, extensions = ('.png',),
             subdirs = False, avoid_directories = None):
    """Returns the list of files in the path '_path' whose extension is in
    'extensions'.

    Args:
        _path: Folder to search.
        extensions: Extensions to keep, compared case-insensitively. If
            empty or None, all files are returned.
        subdirs: Whether the search also descends into sub-folders.
        avoid_directories: If set (and 'subdirs' is true), sub-folders whose
            name equals any element are not entered.

    Returns:
        List of matching file paths.
    """
    wanted = [p.lower() for p in extensions] if extensions else None
    found = []
    for root, dirs, files in os.walk(_path, topdown=True):
        for name in files:
            if wanted is None or os.path.splitext(name)[1].lower() in wanted:
                found.append(os.path.join(root, name))
        # With topdown=True, editing 'dirs' in place controls where os.walk
        # descends next.
        if not subdirs:
            del dirs[:]
        elif avoid_directories is not None:
            dirs[:] = [d for d in dirs if d not in avoid_directories]
    return found
This method is convenient for not parsing .svn paths for instance, which surely will trigger colliding files in find_duplicates.
Feedback is welcome.

#IanLee1521 has a nice solution here. It is very efficient because it checks the duplicate based on the file size first.
#! /usr/bin/env python
# Originally taken from:
# http://www.pythoncentral.io/finding-duplicate-files-with-python/
# Original Author: Andres Torres
# Adapted to only compute the md5sum of files with the same size
import argparse
import os
import sys
import hashlib
def find_duplicates(folders):
    """
    Takes in an iterable of folders and prints & returns the duplicate files.

    Returns a dict mapping a content hash to the list of files with that
    hash, or an empty dict when one of the folders does not exist.
    """
    folders = list(folders)
    # Validate everything up front so a typo in the last argument does not
    # waste a full scan of the earlier folders.
    for i in folders:
        if not os.path.exists(i):
            print('%s is not a valid path, please verify' % i)
            return {}

    dup_size = {}
    for i in folders:
        # Group the files of every folder by size
        join_dicts(dup_size, find_duplicate_size(i))

    print('Comparing files with the same size...')
    dups = {}
    for dup_list in dup_size.values():
        # Only files sharing their size with another file can be duplicates
        if len(dup_list) > 1:
            join_dicts(dups, find_duplicate_hash(dup_list))
    print_results(dups)
    return dups
def find_duplicate_size(parent_dir):
    """Group every file below ``parent_dir`` by its size.

    Args:
        parent_dir: Folder to walk recursively.

    Returns:
        Dict in the format {size_in_bytes: [paths]}.
    """
    dups = {}
    for dirName, subdirs, fileList in os.walk(parent_dir):
        print('Scanning %s...' % dirName)
        for filename in fileList:
            # Get the path to the file
            path = os.path.join(dirName, filename)
            try:
                file_size = os.path.getsize(path)
            except OSError:
                # Broken symlink, or the file vanished after being listed.
                continue
            # Add or append the file path
            dups.setdefault(file_size, []).append(path)
    return dups
def find_duplicate_hash(file_list):
    """Group the files in ``file_list`` by the hash of their content.

    Args:
        file_list: Paths to hash (typically files of equal size).

    Returns:
        Dict in the format {hash: [paths]}.
    """
    print('Comparing: ')
    for filename in file_list:
        print(' {}'.format(filename))
    dups = {}
    for path in file_list:
        dups.setdefault(hashfile(path), []).append(path)
    return dups
# Joins two dictionaries
def join_dicts(dict1, dict2):
    """Merge ``dict2`` (key -> list) into ``dict1`` in place.

    Lists stored under a key present in both dicts are concatenated. The
    lists placed in ``dict1`` are always new objects, so later changes to
    ``dict1`` never leak into ``dict2``.
    """
    for key, values in dict2.items():
        dict1[key] = dict1.get(key, []) + values
def hashfile(path, blocksize=65536):
    """Return the MD5 hex digest of the file at ``path``.

    Args:
        path: File to hash.
        blocksize: Number of bytes read per iteration.
    """
    hasher = hashlib.md5()
    # The with-block closes the file even if a read fails.
    with open(path, 'rb') as afile:
        buf = afile.read(blocksize)
        while len(buf) > 0:
            hasher.update(buf)
            buf = afile.read(blocksize)
    return hasher.hexdigest()
def print_results(dict1):
    """Print every group of identical files contained in ``dict1``.

    Args:
        dict1: Dict in the format {hash: [paths]}; only lists with more than
            one path are reported.
    """
    results = [paths for paths in dict1.values() if len(paths) > 1]
    if not results:
        print('No duplicate files found.')
        return
    print('Duplicates Found:')
    print(
        'The following files are identical. The name could differ, but the'
        ' content is identical'
    )
    print('___________________')
    for result in results:
        for subresult in result:
            print('\t\t%s' % subresult)
        print('___________________')
def main():
    """Parse the folders given on the command line and report their duplicates."""
    parser = argparse.ArgumentParser(description='Find duplicate files')
    parser.add_argument(
        'folders', metavar='dir', type=str, nargs='+',
        help='A directory to parse for duplicates',
    )
    args = parser.parse_args()
    find_duplicates(args.folders)


if __name__ == '__main__':
    sys.exit(main())

import hashlib
import os
import sys
from sets import Set
def read_chunk(fobj, chunk_size = 2048):
    """ Files can be huge so read them in chunks of bytes.

    Yields successive reads of at most ``chunk_size`` bytes until a read
    returns nothing.
    """
    while True:
        chunk = fobj.read(chunk_size)
        if not chunk:
            return
        yield chunk
def remove_duplicates(dir, hashfun = hashlib.sha512):
    """Delete files in ``dir`` whose content duplicates an earlier file.

    Files are visited in sorted name order, so the alphabetically first file
    of each group of identical files is kept. Sub-folders are not entered.

    Args:
        dir: Folder to clean.
        hashfun: Hash constructor used to fingerprint the file content.
    """
    # The built-in set replaces sets.Set, which no longer exists in Python 3.
    unique = set()
    for filename in sorted(os.listdir(dir)):
        filepath = os.path.join(dir, filename)
        if not os.path.isfile(filepath):
            continue
        hashobj = hashfun()
        with open(filepath, 'rb') as fobj:
            # Files can be huge, so hash them chunk by chunk.
            for chunk in iter(lambda: fobj.read(2048), b''):
                hashobj.update(chunk)
        hashfile = hashobj.hexdigest()
        if hashfile not in unique:
            unique.add(hashfile)
        else:
            os.remove(filepath)
# Only the argument lookup sits inside the try block, so an IndexError raised
# while removing duplicates is not mistaken for a missing argument.
try:
    target_dir = sys.argv[1]
except IndexError:
    print("""Please pass a path to a directory with
duplicate files as a parameter to the script.""")
else:
    hashfun = hashlib.sha256
    remove_duplicates(target_dir, hashfun)

Python has a standard library called filecmp to compare files and directories.
It checks for file size. It checks content in 8k chunks.
It works on binary files.
It does not hash.
python docs for filecmp

In order to be safe (removing them automatically can be dangerous if something goes wrong!), here is what I use, based on #zalew's answer.
Please also note that the md5 sum code is slightly different from #zalew's, because his code reported too many false duplicates (that's why I said removing them automatically is dangerous!).
import hashlib, os
# Maps the MD5 hex digest of each distinct file content in the current
# folder to the first file name found with that content. Duplicates are only
# reported, never removed.
unique = dict()
for filename in os.listdir('.'):
    if os.path.isfile(filename):
        # Close every file right after reading it.
        with open(filename, 'rb') as handle:
            filehash = hashlib.md5(handle.read()).hexdigest()
        if filehash not in unique:
            unique[filehash] = filename
        else:
            print(filename + ' is a duplicate of ' + unique[filehash])

I have found 100% working code for removing duplicate files recursively inside a folder. Just replace the folder name in the clean method with your folder name.
import time
import os
import shutil
from hashlib import sha256
class Duplython:
    """Remove files with duplicate content from a folder tree.

    Attributes:
        home_dir: Working directory at construction time.
        File_hashes: SHA-256 hex digests of the distinct files seen so far.
        Cleaned_dirs: Unused; kept for compatibility.
        Total_bytes_saved: Sum of the sizes of all removed files.
        block_size: Number of bytes read per hashing step.
        count_cleaned: Number of removed files.
    """

    def __init__(self):
        self.home_dir = os.getcwd()
        self.File_hashes = []
        self.Cleaned_dirs = []
        self.Total_bytes_saved = 0
        self.block_size = 65536
        self.count_cleaned = 0

    def welcome(self) -> None:
        """Print the banner and pause for three seconds."""
        print('******************************************************************')
        print('**************** DUPLYTHON ****************************')
        print('********************************************************************\n\n')
        print('---------------- WELCOME ----------------------------')
        time.sleep(3)
        print('\nCleaning .................')
        return None

    def generate_hash(self, Filename: str) -> str:
        """Return the SHA-256 hex digest of ``Filename``.

        Returns ``False`` instead when the file cannot be opened or read.
        """
        Filehash = sha256()
        try:
            with open(Filename, 'rb') as File:
                for fileblock in iter(lambda: File.read(self.block_size), b''):
                    Filehash.update(fileblock)
        except OSError:
            # Unreadable files are skipped by clean(); any other error is a
            # programming mistake and is no longer hidden.
            return False
        return Filehash.hexdigest()

    def clean(self, root: str = 'E:\\songs') -> None:
        """Delete every file below ``root`` whose content was seen before.

        Folders are visited top-down and files in sorted name order, so the
        first file of each group of identical files is kept. The working
        directory is never changed, so relative roots work as well.

        Args:
            root: Folder tree to clean.
        """
        # Set for O(1) lookups; File_hashes stays a list for compatibility.
        seen = set(self.File_hashes)
        for dirpath, _dirnames, filenames in os.walk(root):
            for name in sorted(filenames):
                path = os.path.join(dirpath, name)
                if not os.path.isfile(path):
                    continue
                filehash = self.generate_hash(path)
                if not filehash:
                    continue
                if filehash not in seen:
                    seen.add(filehash)
                    self.File_hashes.append(filehash)
                    continue
                byte_saved = os.path.getsize(path)
                os.remove(path)
                self.count_cleaned += 1
                self.Total_bytes_saved += byte_saved
                print(name, '.. cleaned ')

    def cleaning_summary(self) -> None:
        """Print how many files were removed and how much space was freed."""
        mb_saved = self.Total_bytes_saved / 1048576
        mb_saved = round(mb_saved, 2)
        print('\n\n--------------FINISHED CLEANING ------------')
        print('File cleaned : ', self.count_cleaned)
        print('Total Space saved : ', mb_saved, 'MB')
        print('-----------------------------------------------')

    def main(self) -> None:
        """Show the banner, clean the default folder and print the summary."""
        self.welcome()
        self.clean()
        self.cleaning_summary()
#
# if __name__ == '__main__':
# App = Duplython()
# App.main()
def dedupe_bing_images():
    """Run a full Duplython clean-up and return True when it has finished."""
    App = Duplython()
    App.main()
    return True
dedupe_bing_images()

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

How to concatenate a newly added file to pandas dataframe? - python

I think that pandas.concat() is what you are looking for

Related

Compare folder size and run script if the size changed

Script that should remove certain txt files from a directory does not remove the files

Sum of values in a function called from thread

How do I fix this file_tracker that reads/writes using JSON dictionaries?

Finding duplicate files and removing them

Categories

Resources