I'm new to Python and want to find which of ~60k text files have identical contents, and list each distinct content with a count of how many files share it — something like uniq -c, but at the file level rather than the line level.
So far, I have:
from os import listdir
from os.path import isfile, join

mypath = r"C:\Users\daniel.schneider\Downloads\Support"  # my file path
onlyfiles = [f for f in listdir(mypath) if isfile(join(mypath, f))]

for currentFile in onlyfiles:
    currentPath = join(mypath, currentFile)
    print(currentPath)
    with open(currentPath) as f:
        for currentLine in f:  # iterate over the file object, not the filename
            print(currentLine[24:])
    break
I haven't tested it thoroughly, but you could use Python's hashlib to get an MD5 hash of each file, and store the filenames in a list associated with each hash in a dictionary.
Then, to get the unique content with a count of how many files it appears in, iterate over the dictionary:
import os
import hashlib

mypath = 'testdup'
onlyfiles = [f for f in os.listdir(mypath)
             if os.path.isfile(os.path.join(mypath, f))]

files = {}
for filename in onlyfiles:
    filehash = hashlib.md5(open(os.path.join(mypath, filename), 'rb')
                           .read()).hexdigest()
    try:
        files[filehash].append(filename)
    except KeyError:
        files[filehash] = [filename]

for filehash, filenames in files.items():
    print('{0} files have this content:'.format(len(filenames)))
    print(open(os.path.join(mypath, filenames[0])).read())
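If the files are large, a variant of the same idea that hashes in fixed-size chunks (so no file has to be read into memory all at once) might look like the sketch below; hash_file and group_by_content are helper names of my own, and 'testdup' is the same example directory as above:

```python
import os
import hashlib
from collections import defaultdict

def hash_file(path, chunk_size=4096):
    """MD5 hex digest of a file, read in fixed-size chunks."""
    md5 = hashlib.md5()
    with open(path, 'rb') as f:
        for chunk in iter(lambda: f.read(chunk_size), b''):
            md5.update(chunk)
    return md5.hexdigest()

def group_by_content(directory):
    """Map each content hash to the list of filenames sharing it."""
    groups = defaultdict(list)
    for filename in os.listdir(directory):
        full = os.path.join(directory, filename)
        if os.path.isfile(full):
            groups[hash_file(full)].append(filename)
    return groups

if os.path.isdir('testdup'):  # same example directory as above
    for filehash, filenames in group_by_content('testdup').items():
        print('{0} files have this content:'.format(len(filenames)))
```

defaultdict(list) just replaces the try/except KeyError pattern; both do the same thing.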
I have some xml files in a folder, for example 'assests/2020/2010.xml', 'assests/2020/20005.xml', 'assests/2020/20999.xml', etc. I want to get the filename with the maximum numeric value in the '2020' folder. For the three files above, the output should be 20999.xml.
This is what I have tried:
import glob
import os
list_of_files = glob.glob('assets/2020/*')
# latest_file = max(list_of_files, key=os.path.getctime)
# print (latest_file)
I haven't been able to work out the logic to get the required file.
I found a resource that comes closest to answering my query, but I still couldn't build my logic from it.
You can use pathlib to glob for the xml files and access the Path object attributes like .name and .stem:
from pathlib import Path
list_of_files = Path('assets/2020/').glob('*.xml')
print(max(list_of_files, key=lambda p: int(p.stem)).name)
Output:
20999.xml
I can't test it out right now, but you may try this:
files = []
for filename in list_of_files:
    filename = str(filename)
    filename = filename.replace('.xml', '')  # assuming it's not printing your complete directory path
    filename = int(filename)
    files += [filename]
print(files)
This should get you your filenames in integer format, and you should then be able to sort them in descending order and take the first item of the sorted list.
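The sort-and-pick step described above might be sketched like this, using the integer stems from the question's example filenames:

```python
files = [2010, 20005, 20999]  # integer stems, e.g. collected by the loop above
files.sort(reverse=True)      # descending order
largest = str(files[0]) + '.xml'
print(largest)  # 20999.xml
```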
Use re to search for the appropriate endings in your file paths. If found, use re again to extract the number.
import re

list_of_files = [
    'assests/2020/2010.xml',
    'assests/2020/20005.xml',
    'assests/2020/20999.xml'
]

highest_nr = -1
highest_nr_file = ''
for f in list_of_files:
    re_result = re.findall(r'\d+\.xml$', f)
    if re_result:
        nr = int(re.findall(r'\d+', re_result[0])[0])
        if nr > highest_nr:
            highest_nr = nr
            highest_nr_file = f
print(highest_nr_file)
Result
assests/2020/20999.xml
You can also try it this way.
import os, re

path = "assests/2020/"
files = [
    "assests/2020/2010.xml",
    "assests/2020/20005.xml",
    "assests/2020/20999.xml"
]
n = [int(re.findall(r'\d+\.xml$', file)[0].split('.')[0]) for file in files]
output = str(max(n)) + ".xml"
print("The .xml file with the largest name is", os.path.join(path, output))
Output:
The .xml file with the largest name is assests/2020/20999.xml
import glob

xmlFiles = []
# collect the numeric part of every xml file name in your directory
for file in glob.glob("*.xml"):
    xmlFiles.append(int(file[:-4]))  # strip the '.xml' extension, compare as integers

# print the maximum one
print(str(max(xmlFiles)) + ".xml")
With the help of this thread:
https://codereview.stackexchange.com/questions/147056/short-script-to-hash-files-in-a-directory
I managed to get almost exactly what I needed. The given code is:
from os import listdir, getcwd
from os.path import isfile, join, normpath, basename
import hashlib

def get_files():
    current_path = normpath(getcwd())
    return [join(current_path, f) for f in listdir(current_path)
            if isfile(join(current_path, f))]

def get_hashes():
    files = get_files()
    list_of_hashes = []
    for each_file in files:
        hash_md5 = hashlib.md5()
        with open(each_file, "rb") as f:
            for chunk in iter(lambda: f.read(4096), b""):
                hash_md5.update(chunk)
        list_of_hashes.append('Filename: {}\tHash: {}\n'.format(basename(each_file), hash_md5.hexdigest()))
    return list_of_hashes

def write_hashes():
    hashes = get_hashes()
    with open('list_of_hashes.txt', 'w') as f:
        for md5_hash in hashes:
            f.write(md5_hash)

if __name__ == '__main__':
    write_hashes()
However, I'd also like to include all files in the subfolders of my given path in the output. I tried using os.walk(), but didn't manage to get it working.
Can you help me adjust the function get_files() so that it generates MD5 hashes for all files in subfolders (i.e. walks the entire folder structure)?
Thanks for any help!
Try this:
from os import walk, getcwd
from os.path import join, normpath

current_path = normpath(getcwd())
listOfFiles = []
for (dirpath, dirnames, filenames) in walk(current_path):
    listOfFiles += [join(dirpath, file) for file in filenames]
(based on this source)
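Folded into the question's get_files(), a recursive version might look like this sketch; the root parameter is my own addition for flexibility, and omitting it keeps the original current-directory behaviour:

```python
from os import walk, getcwd
from os.path import join, normpath

def get_files(root=None):
    """Return the paths of all files under root, including subfolders."""
    current_path = normpath(root or getcwd())
    paths = []
    for dirpath, dirnames, filenames in walk(current_path):
        paths.extend(join(dirpath, f) for f in filenames)
    return paths
```

get_hashes() should need no change, since it simply iterates over whatever get_files() returns.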
I want to extract file names from a folder and be able to use the names as strings:
I can get the file names from the folder but they are going into a list. Ideally I do not want to iterate over the list. Code so far:
from os import listdir
from os.path import isfile, join
import glob

f = '/path/to/file/*.sql'
for files in glob.glob(f):
    with open(files, 'r', encoding="utf-8", errors="replace") as f:
        path = "/path/to/file/folder"
        onlyTxtFiles = [f for f in listdir(path) if isfile(join(path, f)) and f.endswith(".sql")]
I'm not sure how to get the file name string into the variable onlyTxtFiles.
Edit:
I want to be able to do this:
onlyTxtFiles = 'file name'
str1 = onlyTxtFiles + 'test'
Edit:
This is a for loop I made, but I am not looping over the files:
f = '/path/to/file/sql_files/*.sql'
for files in glob.glob(f):
    with open(files, 'r', encoding="utf-8", errors="replace") as f:
        path = "/path/to/file/sql_files/"
        onlySqlFiles = [f for f in listdir(path) if isfile(join(path, f)) and f.endswith(".sql")]
        for x in onlySqlFiles:
            file_name_insert = x
What am I doing wrong?
You say you want the code to be something like:
onlyTxtFiles = 'file name'
str1 = onlyTxtFiles + 'test'
but if onlyTxtFiles is a list, concatenating it with a string will raise an error.
One way would be to index into the list:
onlyTxtFiles = ['file name']
str1 = onlyTxtFiles[0] + 'test'
for each expected value,
or you could just use a for loop.
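As a sketch of the loop the asker seems to want — each globbed path reduced to a bare filename string via os.path.basename (the glob pattern here is the one from the question; adjust it to your folder):

```python
import glob
import os

for path in glob.glob('/path/to/file/sql_files/*.sql'):
    file_name = os.path.basename(path)  # just the filename, no directory part
    str1 = file_name + 'test'           # plain string concatenation now works
    print(str1)
```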
I have this:
from os import path

base_path = "C:\\texts\\*.txt"
for file in files:
    with open(file) as in_file, open(path.join(base_path, "%s_tokenized.txt" % file), "w") as out_file:
        data = in_file.readlines()
        for line in data:
            words = line.split()
            str1 = ','.join(words)
            out_file.write(str1)
            out_file.write("\n")
It produces tokenized files in the same directory it reads from. How can I write those out_files to a different directory, such as "C:\\texts\\Tokenized"?
I know there are ways to move the new files to another directory after producing them, but what I want to know is whether there is any way to write the new files to the other directory at the time they are produced by the code above.
Is this what you're looking for:
import os
import glob

source_pattern = 'c:/texts/*.txt'
output_directory = 'c:/texts/tokenized'

# Iterate over files matching source_pattern
for input_file in glob.glob(source_pattern):
    # build the output filename
    base, ext = os.path.splitext(os.path.basename(input_file))
    output_file = os.path.join(output_directory, base + '_tokenized' + ext)
    with open(input_file) as in_file, open(output_file, 'w') as out_file:
        for line in in_file:
            out_file.write(','.join(line.split()) + '\n')
This is how I output to files in arbitrary directories :
import os

dir_name = "../some_dir"
if not os.path.exists(dir_name):
    os.makedirs(dir_name)
out_file_name = dir_name + '/out.txt'
out_file = open(out_file_name, 'w')
EDIT :
file_name = "{0}_tokenized.txt".format(something_from_tokenizing)
if not os.path.exists(dir_name):
    os.makedirs(dir_name)
out_file_name = os.path.join(dir_name, file_name)
EDIT :
I just tried it and it worked for me. You simply need two paths: one for the source directory and one for the destination. Hope this helps.
import os
from os import path

files = ["in.txt", "out.txt"]
base_path = "."
dir_name = "./some_dir"
if not os.path.exists(dir_name):
    os.makedirs(dir_name)

for file in files:
    with open(file) as in_file, open(path.join(dir_name, "%s_tokenized.txt" % file), "w") as out_file:
        data = in_file.readlines()
        for line in data:
            words = line.split()
            str1 = ','.join(words)
            out_file.write(str1)
            out_file.write("\n")
This question is about how to get the list of files in a directory into a text file using Python.
Result in the text file should exactly be like this:
E:\AA\a.jpg
E:\AA\b.jpg
...
How to correct the code below:
WD = "E:\\AA"
import glob
files = glob.glob('*.jpg')
with open('infiles.txt', 'w') as in_files:
    in_files.write(files + '\n')
glob.glob() returns a list. You have to iterate through it.
WD = "E:\\AA"
import glob
files = glob.glob('*.jpg')
with open('infiles.txt', 'w') as in_files:
    for eachfile in files:
        in_files.write(eachfile + '\n')
Input directory path: WD = "E:/AA"
You can give a specific file extension if that is what you need, e.g. path = WD + '/*.jpg';
if you need the full file list, use '*', e.g. path = WD + '/*'.
import glob

w_dir = WD + "/*.jpg"
with open("infiles.txt", "w") as fp:
    for path in glob.glob(w_dir):
        fp.write(path + "\n")
Without a path, glob.glob returns a list of filenames (no directory part). To get the full path you need to call os.path.abspath(filename) / os.path.realpath(filename) / os.path.join(WD, filename).
>>> glob.glob('*.png')
['gnome-html.png', 'gnome-windows.png', 'gnome-set-time.png', ...]
>>> os.path.abspath('gnome-html.png')
'/usr/share/pixmaps/gnome-html.png'
With a path, glob.glob returns a list of filenames with the directory part included.
>>> glob.glob('/usr/share/pixmaps/*.png')
['/usr/share/pixmaps/gnome-html.png', '/usr/share/pixmaps/gnome-windows.png', '/usr/share/pixmaps/gnome-set-time.png', ...]
import glob
import os

WD = r'E:\AA'
files = glob.glob(os.path.join(WD, '*.jpg'))
with open('infiles.txt', 'w') as in_files:
    in_files.writelines(fn + '\n' for fn in files)
or
import glob
import os

WD = r'E:\AA'
os.chdir(WD)
files = glob.glob('*.jpg')
with open('infiles.txt', 'w') as in_files:
    in_files.writelines(os.path.join(WD, fn) + '\n' for fn in files)
Here is a short solution:
import os

given_dir = 'the_directory'
with open('all_names.txt', 'w') as filee:
    for i in os.listdir(given_dir):
        filee.write(os.path.join(os.path.dirname(os.path.abspath(__file__)), given_dir, i) + '\n')
where given_dir is the directory name. The output is a text file (all_names.txt) in which each line is the full path to a file or directory in given_dir.
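For comparison, a pathlib sketch of the same idea — iterdir() lists the entries and resolve() makes each path absolute; the directory name here is a placeholder:

```python
from pathlib import Path

given_dir = Path('the_directory')  # your directory name
if given_dir.is_dir():
    with open('all_names.txt', 'w') as out:
        for entry in given_dir.iterdir():
            out.write(str(entry.resolve()) + '\n')
```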