how to crop images from their annotations XML format? - python

I have one folder which contains images and their annotations of bounding boxes in XML format. I have tried this script, but there is no result and no errors. can someone help me to solve this and thank you. the rest of my code in the comment..
original_file = r"C:\Users\probook\Downloads\Compressed\crop\train"
dst = r"C:\Users\probook\Downloads\Compressed\crop\save"
def check_folder_exists(path):
if not os.path.exists(path):
try:
os.makedirs(path)
print('create ' + path)
except OSError as e:
if e.errno != errno.EEXIST:
raise
seed_arr = []
for xml_file in glob.glob('train/*.xml'):
root = ET.parse(xml_file).getroot()
filename = root.find('filename').text
for type_tag in root.findall('size'):
#file_name = type_tag.find('filename').text
width = type_tag.find('width').text
height = type_tag.find('height').text
for type_tag in root.findall('object'):
class_name = type_tag.find('name').text
xmin = type_tag.find('bndbox/xmin').text
ymin = type_tag.find('bndbox/ymin').text
xmax = type_tag.find('bndbox/xmax').text
ymax = type_tag.find('bndbox/ymax').text
all_list = [filename, width, height,
class_name, xmin, ymin, xmax, ymax]
seed_arr.append(all_list)
seed_arr.sort()
for index, line in enumerate(seed_arr):
filename = line[0]
width = line[1]
height = line[2]
class_name = line[3]
xmin = line[4]
ymin = line[5]
xmax = line[6]
ymax = line[7]
load_img_path = os.path.join(original_file, filename)
save_class_path = os.path.join(dst, class_name)
check_folder_exists(save_class_path)
save_img_path = os.path.join(save_class_path,
str(index)+'_'+filename)
img = Image.open(load_img_path)
crop_img = img.crop((int(xmin), int(ymin), int(xmax), int(ymax)))
im1 = crop_img.resize(64 , 64)
im1.save(save_img_path, 'JPEG')
print('save ' + save_img_path)

You can use this script instead:
cut ROI from xml annotations
Clone the repo.
Install xmltodict using conda install -c conda-forge xmltodict
Copy the images and corresponding xml annotations into the data folder (inside the cloned repo directory).
Launch jupyter notebook and run the main_classification.ipynb file.

Related

How to convert XML Files into Text files (YOLOV3 Format) for object detection

How we convert XML annotation folder into text or YOLOv3 Format for detection??? I used this code for conversion but it only take one xml image and convert into .txt ..but i want to convert my full folder at once time. You have Any easy solution to convert xml files into text files. i Have 15000+ images.
from xml.dom import minidom
import os
import glob
lut={}
lut["14111"] =0
lut["14131"] =1
lut["14141"] =2
def convert_coordinates(size, box):
dw = 1.0/size[0]
dh = 1.0/size[1]
x = (box[0]+box[1])/2.0
y = (box[2]+box[3])/2.0
w = box[1]-box[0]
h = box[3]-box[2]
x = x*dw
w = w*dw
y = y*dh
h = h*dh
return (x,y,w,h)
def convert_xml2yolo( lut ):
for fname in glob.glob("/content/gdrive/MyDrive/Dataset /Annotation/14111_00000002.xml"):
xmldoc = minidom.parse(fname)
fname_out = (fname[:-4]+'.txt')
with open(fname_out, "w") as f:
itemlist = xmldoc.getElementsByTagName('object')
size = xmldoc.getElementsByTagName('size')[0]
width = int((size.getElementsByTagName('width')[0]).firstChild.data)
height = int((size.getElementsByTagName('height')[0]).firstChild.data)
for item in itemlist:
# get class label
classid = (item.getElementsByTagName('name')[0]).firstChild.data
if classid in lut:
label_str = str(lut[classid])
else:
label_str = "-1"
print ("warning: label '%s' not in look-up table" % classid)
# get bbox coordinates
xmin = ((item.getElementsByTagName('bndbox')[0]).getElementsByTagName('xmin')[0]).firstChild.data
ymin = ((item.getElementsByTagName('bndbox')[0]).getElementsByTagName('ymin')[0]).firstChild.data
xmax = ((item.getElementsByTagName('bndbox')[0]).getElementsByTagName('xmax')[0]).firstChild.data
ymax = ((item.getElementsByTagName('bndbox')[0]).getElementsByTagName('ymax')[0]).firstChild.data
b = (float(xmin), float(xmax), float(ymin), float(ymax))
bb = convert_coordinates((width,height), b)
#print(bb)
f.write(label_str + " " + " ".join([("%.6f" % a) for a in bb]) + '\n')
print ("wrote %s" % fname_out)
def main():
convert_xml2yolo( lut )
if __name__ == '__main__':
main()
Follow this github repository.
You just need to edit this line:
for fname in glob.glob("/content/gdrive/MyDrive/Dataset/Annotation/*.xml"):
This means you will read all the .xml files in the Annotation folder and convert them to .txt files.

AttributeError: 'bytes' object has no attribute 'read'

I am working on this face recognition system.I have a folder with subfolders that has face images inside it. I am trying to loop through all the subfolders that consists of images and use my 'align_face' function that detects the face and crops and aligns all the images in subfolders. It then has to save all the aligned and cropped in another folder
I have tried this:
def align_face(imagePath):
image = face_recognition.load_image_file(imagePath)
face_locations = face_recognition.face_locations(image)
face_landmarks = face_recognition.face_landmarks(image)
if len(face_locations) == 0:
print("Couldn't detect face for pid {} in path {}".format(Id,imagePath))
return []
if len(face_locations) > 1:
return []
else:
(top, right, bottom, left) = face_locations[0]
desiredWidth = (right - left)
desiredHeight = (bottom - top)
leftEyePts = face_landmarks[0]['left_eye']
rightEyePts = face_landmarks[0]['right_eye']
if len(leftEyePts) == 0 or len(rightEyePts) == 0:
print("Couldn't detect both eyes for pid {} in path {}".format(Id,imagePath))
return []
else:
leftEyeCenter = np.array(leftEyePts).mean(axis=0).astype("int")
rightEyeCenter = np.array(rightEyePts).mean(axis=0).astype("int")
leftEyeCenter = (leftEyeCenter[0],leftEyeCenter[1])
rightEyeCenter = (rightEyeCenter[0],rightEyeCenter[1])
dY = rightEyeCenter[1] - leftEyeCenter[1]
dX = rightEyeCenter[0] - leftEyeCenter[0]
angle = np.degrees(np.arctan2(dY, dX))
desiredLeftEye=(0.35, 0.35)
desiredFaceWidth = desiredWidth
desiredFaceHeight = desiredHeight
desiredRightEyeX = 1.0 - desiredLeftEye[0]
dist = np.sqrt((dX ** 2) + (dY ** 2))
desiredDist = (desiredRightEyeX - desiredLeftEye[0])
desiredDist *= desiredFaceWidth
scale = desiredDist / dist
eyesCenter = ((leftEyeCenter[0] + rightEyeCenter[0]) // 2,
(leftEyeCenter[1] + rightEyeCenter[1]) // 2)
M = cv2.getRotationMatrix2D(eyesCenter, angle, scale)
tX = desiredFaceWidth * 0.5
tY = desiredFaceHeight * desiredLeftEye[1]
M[0, 2] += (tX - eyesCenter[0])
M[1, 2] += (tY - eyesCenter[1])
(w, h) = (desiredFaceWidth, desiredFaceHeight)
output = cv2.warpAffine(image, M, (w, h),flags=cv2.INTER_CUBIC)
output = cv2.cvtColor(output, cv2.COLOR_BGR2RGB)
print("images aligned")
return output
#Now that we have defined the face alignmet and cropping, we walk through each subfolder and use the align function
for root, dirs, files in os.walk('<path to subdirectories that has face pictures>'):
for fname in files:
fpath = os.path.join(root, fname)
with open(fpath, 'rb') as f, open('<path to new folder to store cropped and aligned picture>', 'w') as newfile:
data = f.read()
new_data = align_face(data) #Implementing align_face function
newfile.write(new_data)
However, I keep getting an error.
AttributeError: 'bytes' object has no attribute 'read'
Does anyone know why?Any help would be appreciated. Thank you.
Full error is this:
The argument to align_face is the name of the file containing the image data, not the image data. So you don't need to open and read the data in your loop.
for root, dirs, files in os.walk('<path to subdirectories that has face pictures>'):
for fname in files:
fpath = os.path.join(root, fname)
with open('<path to new folder to store cropped and aligned picture>', 'w') as newfile:
new_data = align_face(fpath) #Implementing align_face function
newfile.write(new_data)

How can I convert from JSON to png in a dataset labeled with labelbox?

I have a JSON file with the next structure:
json
{'featureId': 'ckek0ugf2061y0ybwgunbdrt5',
'schemaId': 'ckek0jkvp081j0yaec2ap9a3w',
'title': 'Tree',
'value': 'tree',
'color': '#FFFF00',
'instanceURI': 'https://api.labelbox.com/masks/feature/ckek0ugf2061y0ybwgunbdrt5?token=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9...'}
InstanceURI is one tree that I segmented from the original image used Labelbox (https://labelbox.com/). I am using PSPNet-cityscapes. That model requires a mask to the validation stage in png format. Some images have several trees (several instances URIs).
How can I convert this JSON element in a png image?
Not the fastest and most beautiful script - but it works for me...
from PIL import Image, ImageColor, ImageDraw
from PIL import UnidentifiedImageError
import requests
import json
import argparse
import pathlib
import os.path
def manual_classes():
"""
Change your preferenced color-coding below.
If you want to use manual coloring, you also need to change the Label-Classes (Title)
"""
manual_dict = {
'Tree': 255,
'Flower': 85,
}
return manual_dict
def open_img(url):
try:
return Image.open(requests.get(url, stream=True).raw)
except UnidentifiedImageError:
return None
def open_json(path):
with open(path) as file:
return json.load(file)
def color_extractor(data, color_coding):
"""takes the given dictionary part and extracts all needed information. returns also colors for 3 different types"""
if color_coding == 'auto':
color = ImageColor.getcolor(data['color'], 'RGBA')
elif color_coding == 'manual':
color = (manual_classes()[data['title']],manual_classes()[data['title']],manual_classes()[data['title']],255)
elif color_coding == 'binar':
color = (255,255,255,255)
else:
print('no valid color-code detected - continue with binarized Labels.')
color = (255,255,255,255)
return color
def img_color(img, color):
"""change color of label accordingly"""
if color == (255,255,255,255):
return img
img = img.convert('RGBA')
width, height = img.size
for x in range(width):
for y in range(height):
if img.getpixel((x,y)) == (255,255,255,255):
img.putpixel((x,y), color)
return img
def img_draw_polygon(size, polygon, color):
"""draw polygons on image"""
img = Image.new('RGBA', size, (0,0,0,0))
img = img.convert('RGBA')
draw = ImageDraw.Draw(img)
# read points
points = []
for i in range(len(polygon)):
points.append((int(polygon[i]['x']),int(polygon[i]['y'])))
draw.polygon(points, fill = (color))
return img
def progressBar(current, total, barLength = 20):
percent = float(current) * 100 / total
arrow = '-' * int(percent/100 * barLength - 1) + '>'
spaces = ' ' * (barLength - len(arrow))
print('Progress: [%s%s] %d %%' % (arrow, spaces, percent), end='\r')
def main(input_dir, output_dir, color_type='auto'):
if os.path.exists(input_dir) and os.path.exists(output_dir) and color_type in ['auto', 'manual', 'binar']:
input_path = pathlib.Path(input_dir)
label_paths_sorted = sorted(list(input_path.glob("*.json")))
for image_path in label_paths_sorted:
print('converting: {}'.format(os.path.basename(image_path)))
# open json file
data = open_json(image_path)
# create image list for Labels
img_list = []
# read original image
original_img = open_img(data[0]['Labeled Data'])
try:
width, height = original_img.size
except Exception:
print('Original image data not callable. Please provide image width and height.')
for i in range(len(data[0]['Label']['objects'])):
# read path and open image
img = open_img(data[0]['Label']['objects'][i]['instanceURI'])
# if path is not readable try to read polygon-data-points
if not img is None:
img = img_color(img, color_extractor(data[0]['Label']['objects'][i], color_type))
img_list.append(img)
else:
try:
# img = img_draw_polygon(img, data[0]['Label']['objects'][i]['polygon'], data[0]['Label']['objects'][i]['title'])
img = img_draw_polygon((width,height), data[0]['Label']['objects'][i]['polygon'], color_extractor(data[0]['Label']['objects'][i], color_type))
img_list.append(img)
except Exception:
print('Note: There are no available polygon-data-points & web-data-information for Label #{}.'.format(i))
# print current progress status
progressBar(i, len(data[0]['Label']['objects']))
img = img_list[0]
for i in range(1, len(img_list)):
img.paste(img_list[i], (0,0), mask= img_list[i])
img.save(output_dir + os.path.basename(image_path).replace('.json', '.png'))
else:
print('One of your given inputs is incorrect - please try again.')
if __name__ == '__main__':
parser = argparse.ArgumentParser(description="convert annotations from labelbox2png")
parser.add_argument("--input", help="input-directory")
parser.add_argument("--output", help="output-directory")
parser.add_argument("--color", help="binar, auto or manual")
args = parser.parse_args()
main(args.input, args.output, args.color)
To run it - just save this python-script and execute it in your command:
C:\Users>python script.py --input input_directory/ --output output_directory --color auto
With the input color you can modify the color-coding of your Labels. auto takes the colors from the JSON, manual you have to modify and binar white-labels everything.

google.api_core.exceptions.PermissionDenied: 403 Request had insufficient authentication scopes

Don't know how to ask what to fix here cause I don't know the problem. Is it the 1) google.api_core.exceptions.PermissionDenied: 403 Request had insufficient authentication scopes. 2) commented out init() towards the end. I was receiving the error: rm: cannot remove 'tmp': No such file or directory
Cleaning up...
whats here????
Finding people...
Traceback (most recent call last):
File "try-me.py", line 227, in <module>
timestamps_to_pull = analyze_labels(MOVIE_TO_PROCESS, BUCKET_NAME)
File "try-me.py", line 40, in analyze_labels
path, features=features, video_context=context)
File "/home/jackwhitely/.local/lib/python2.7/site-packages/google/cloud/videointelligence_v1/gapic/video_intelligence_service_client.py"
, line 202, in annotate_video
operation = self._annotate_video(request, retry=retry, timeout=timeout)
File "/home/jackwhitely/.local/lib/python2.7/site-packages/google/api_core/gapic_v1/method.py", line 139, in __call__
return wrapped_func(*args, **kwargs)
File "/home/jackwhitely/.local/lib/python2.7/site-packages/google/api_core/retry.py", line 260, in retry_wrapped_func
on_error=on_error,
File "/home/jackwhitely/.local/lib/python2.7/site-packages/google/api_core/retry.py", line 177, in retry_target
return target()
File "/home/jackwhitely/.local/lib/python2.7/site-packages/google/api_core/timeout.py", line 206, in func_with_timeout
return func(*args, **kwargs)
File "/home/jackwhitely/.local/lib/python2.7/site-packages/google/api_core/grpc_helpers.py", line 56, in error_remapped_callable
six.raise_from(exceptions.from_grpc_error(exc), exc)
File "/home/jackwhitely/.local/lib/python2.7/site-packages/six.py", line 737, in raise_from
raise value
google.api_core.exceptions.PermissionDenied: 403 Request had insufficient authentication scopes.
#--- The Original Code: http://amunategui.github.io/google-actor-recognition/index.html ---#
# --- Very Basic Instructions ---
# 1 - place a video clip in a bucket on your Google Cloud Storage and set permission to public
# 2 - run the code from the GCP cloud VM
# 3 - run the requirements.txt file (pip install -r requirements.txt)
# 4 - run video_processing.py clip_name bucket_name at the command prompt
# this will create tmp folder and under a series of folders including faces_found and text_found
# where it will store what it learned from your clip
# 5 - Don't forget to delete the clip (or remove public permission at the very least) and turn
# you VM off!
# If you have ffmpeg issues try this:
# sudo apt-get install ffmpeg
import glob, os, sys, io, skvideo.io, argparse, math, datetime, ffmpy, shutil, wikipedia
from google.cloud import videointelligence
from google.cloud import vision
from google.cloud import storage
from google.cloud.vision import types
from PIL import Image, ImageDraw
import numpy as np
def init():
# clean out directory structure
os.system('rm -r tmp')
def analyze_labels(movie_to_process, bucket_name):
path = 'gs://' + bucket_name + '/' + movie_to_process
""" Detects labels given a GCS path. """
video_client = videointelligence.VideoIntelligenceServiceClient()
features = [videointelligence.enums.Feature.LABEL_DETECTION]
mode = videointelligence.enums.LabelDetectionMode.SHOT_AND_FRAME_MODE
config = videointelligence.types.LabelDetectionConfig(
label_detection_mode=mode)
context = videointelligence.types.VideoContext(
label_detection_config=config)
operation = video_client.annotate_video(
path, features=features, video_context=context)
print('\nProcessing video for label annotations:')
result = operation.result(timeout=90)
print('\nFinished processing.')
frame_offsets = []
# Process frame level label annotations
frame_labels = result.annotation_results[0].frame_label_annotations
for i, frame_label in enumerate(frame_labels):
#if (frame_label.entity.description == 'person'):
print('Frame label description: {}'.format(
frame_label.entity.description))
for category_entity in frame_label.category_entities:
if (category_entity.description == 'person'):
print('\tLabel category description: {}'.format(
category_entity.description))
print(frame_label)
# Each frame_label_annotation has many frames,
# here we print information only about the first frame.
#for frame in frame_label.frames:
frame = frame_label.frames[0]
time_offset = (frame.time_offset.seconds +
frame.time_offset.nanos / 1e9)
print('\tFirst frame time offset: {}s'.format(time_offset))
print('\tFirst frame confidence: {}'.format(frame.confidence))
print('\n')
frame_offsets.append(time_offset)
return(sorted(set(frame_offsets)))
def extract_image_from_video(video_input, name_output, time_stamp):
ret = "Error"
try:
ret = os.system("ffmpeg -i " + video_input + " -ss " + time_stamp + " -frames:v 1 " + name_output)
# if all goes well FFMPEG will return 0
return ret
except ValueError:
return("Oops! error...")
def crop_image(input_image, output_image, start_x, start_y, width, height):
"""Pass input name image, output name image, x coordinate to start croping, y coordinate to start croping, width to crop, height to crop """
input_img = Image.open(input_image)
# give the image some buffer space
start_with_buffer_x = int(start_x - np.ceil(width/2))
start_with_buffer_y = int(start_y - np.ceil(height/2))
width_with_buffer = int(start_x + width + np.ceil(width/2))
height_with_buffer = int(start_y + height + np.ceil(height/2))
box = (start_with_buffer_x, start_with_buffer_y, width_with_buffer, height_with_buffer)
output_img = input_img.crop(box)
output_img.save(output_image +".png")
return (output_image +".png")
def detect_face(face_file, max_results=4):
# can you find a face and return coordinates
client = vision.ImageAnnotatorClient()
content = face_file.read()
image = types.Image(content=content)
# return coords of face
return client.face_detection(image=image).face_annotations
def highlight_faces(image, faces):
# Draws a polygon around the faces, then saves to output_filename.
faces_boxes = []
im = Image.open(image)
draw = ImageDraw.Draw(im)
for face in faces:
box = [(vertex.x, vertex.y)
for vertex in face.bounding_poly.vertices]
draw.line(box + [box[0]], width=5, fill='#00ff00')
faces_boxes.append([box[0][0], box[0][1], box[1][0] - box[0][0], box[3][1] - box[0][1]])
return (faces_boxes)
def annotate(path):
"""Returns web annotations given the path to an image."""
client = vision.ImageAnnotatorClient()
if path.startswith('http') or path.startswith('gs:'):
image = types.Image()
image.source.image_uri = path
else:
with io.open(path, 'rb') as image_file:
content = image_file.read()
image = types.Image(content=content)
web_detection = client.web_detection(image=image).web_detection
return web_detection
def report(annotations, max_report=5):
"""Prints detected features in the provided web annotations."""
names = []
if annotations.web_entities:
print ('\n{} Web entities found: '.format(
len(annotations.web_entities)))
count = 0
for entity in annotations.web_entities:
print('Score : {}'.format(entity.score))
print('Description: {}'.format(entity.description))
names.append(entity.description)
count += 1
if count >=max_report:
break;
return names
def get_stills(movie_to_process, bucket_name, timestamps_to_pull):
video_location = 'https://storage.googleapis.com/' + bucket_name + '/' + movie_to_process
storage_client = storage.Client()
max_results = 3
timestamps_to_pull_tmp = timestamps_to_pull + [x + 0.15 for x in timestamps_to_pull[:-1]] + [x - 0.15 for x in timestamps_to_pull[1:]]
# clear out stills folder
if len(timestamps_to_pull_tmp) > 0:
# create directory structure
os.system('mkdir tmp')
os.system('mkdir tmp/faces_found')
os.system('mkdir tmp/text_found')
os.system('mkdir tmp/face_images')
filepath = 'tmp/'
# make stills
cnt_ = 0
for ttp in timestamps_to_pull_tmp:
# get the still image at that timestamp
time_stamp = str(datetime.timedelta(seconds=ttp))
file = "still_" + str(cnt_) + ".png"
filePathAndName = filepath + file
print('filename: ' + time_stamp)
ret = extract_image_from_video(video_input = video_location, name_output = filePathAndName, time_stamp = time_stamp)
cnt_ += 1
# find face on still image
with open(filePathAndName, 'rb') as image:
faces = detect_face(image, max_results)
print('Found {} face{}'.format(
len(faces), '' if len(faces) == 1 else 's'))
print('Looking for a face {}'.format(filePathAndName))
# Reset the file pointer, so we can read the file again
image.seek(0)
faces_boxes = highlight_faces(filePathAndName, faces) #, filePathAndName)
print('faces_boxes:', faces_boxes)
if len(faces_boxes) > 0:
# image had a face
count = 0
for face_box in faces_boxes:
# cv2.imwrite("frame%d.jpg" % count, image) # save frame as JPEG file
saved_name = crop_image(filePathAndName, "tmp/face_images/" + file.split('.')[0] + str(count) + '_faces', face_box[0], face_box[1], face_box[2], face_box[3])
count += 1
# get actors name
potential_names = report(annotate(saved_name),2)
print('potential_names: ', potential_names)
# does the first have two words - as in first and last name?
if (len(potential_names[0].split()) == 2):
# we have a winner
new_name = 'tmp/faces_found/' + potential_names[0] + '.png'
shutil.copy(saved_name,new_name)
# extract wiki bio
rez = wikipedia.page(potential_names[0]).content
# keep only intro paragraph
with open('tmp/text_found/' + potential_names[0] + ".txt", "w") as text_file:
text_file.write(rez.split('\n\n')[0] + " (Source: Wikipedia.com)")
BUCKET_NAME = ''
MOVIE_TO_PROCESS = ''
if __name__ == "__main__":
if len(sys.argv) == 3:
MOVIE_TO_PROCESS = sys.argv[1]
BUCKET_NAME = sys.argv[2]
# start things off clean
print('Cleaning up...')
print ('whats here????')
# init()
print('Finding people...')
# use video intelligence to find high probability of people being visible
timestamps_to_pull = analyze_labels(MOVIE_TO_PROCESS, BUCKET_NAME)
print('Processing people...')
get_stills(MOVIE_TO_PROCESS, BUCKET_NAME, timestamps_to_pull)
print('All done...')
else:
print('Wrong argument inputs')
Some recommendations for each of your points:
1.- PermissionDenied: 403 Request had insufficient authentication scopes.
First step is to confirm that your APIs are enabled, Videointelligence and Vision.
If error persists, you can review the following recommendations depending on how you authenticate the access to Google Cloud:
Using a Service account. You need to generate its credentials in a JSON file, then you have to set up the environment variable GOOGLE_APPLICATION_CREDENTIALS. An easy way to do it from your code is: os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "path/to/imgtotext.json"
Using an API Key. To use this mechanism you need to construct a REST request in your code, so your code has to be adapted.
2.- Cannot remove 'tmp': No such file or directory
This is because the command looks for a directory tmp in the same directory where you execute your python program, for sure it doesn't exist. Try to use an absolute path, for example /home/myuser/ml-cloud/tmp.

How to create and save pairwise hamming distances of image perceptual hashes for imput to clustering algorithm

hoping somebody can provide guidance as to how to compute pairwise hamming distance of a bunch of hashes and then cluster them. I don't care so much as to performance as from looking at what I am doing and what I want to do its going to be slow no matter what and its not something that will be repeatedly run over and over.
So...in a nutshell I had mistakenly erased 1000's of photos off a drive and had no backups (I know...bad practise). Using various tools I was able to recover a very high % of them from the drive but was left with hundreds of 1000's of photos. Due to the techniques used for recovering some of the photos (such as file carving) some of the images are corrupt to various degrees, others were identical copies, and yet others were essentially identical visually but byte for byte were different.
What I am looking at doing to help the situation is the following:
check each image and identify if image file is structurally corrupt or not (done)
generate perceptual hashes (fingerprint) for each image so that images can be compared for similarity and clustered (fingerprinting part is done)
calculate the pairwise distance of the fingerprints
cluster the pairwise distance so that similar images can be viewed together to aide manual cleanup
In the script attached you will notice a couple of places I calculate hashes, I will explain as to not cause confusion...
for images that are supported by PIL I generate three hashes, 1st for original image, 2nd is rotated 90 degrees, and 3rd is rotated 180 degrees. This was done so that when the pairwise calculations are done I can account for images that just vary in orientation.
for raw images not supported by PIL I instead favour hashes that are generated from the extracted embedded preview image. I did this instead of using the raw image because in the case of a corrupt raw image file there was a high chance of probability that the preview image was intact due to its smaller size and thus would be better for identifying if the image is similar to others
other place hashes are generated is during a last ditch effort to identify corrupt raw images. I would compare hashes of the extracted/converted raw image to that of the extracted embedded preview image and if the similarity does not meet a defined threshold it is assumed that there is probably corruption of the raw file as a whole.
What I need guidance on is how to accomplish the following:
take the three hashes I have for each image and calculate hamming pairwise distances
for each image comparison keep only the hamming distance that is most similar
feed the results into scipy hierarchical clustering so that I can group similar images
I am just learning Python so that is part of the my challenge... I think from what I have got from google I can do this by first getting the pairwise distances using scipy.spatial.distance.pdist, then process this to keep the most similar distance for each image comparison, then feed this to a scipy clustering function. But I cannot figure how to organize this and provide things in the proper format etc. Can anyone provide some guidance on this?
Here is my current script for reference in case anyone else finds it interesting that I will need to alter for storing some sort of dictionary of hashes or maybe some sort of on disk storage.
from PIL import Image
from PIL import ImageFile
import os, sys, imagehash, pyexiv2, rawpy, re
from tempfile import NamedTemporaryFile
from subprocess import check_call, call
# allow PIL to load truncated images (so that perceptual hashes can be created for truncated/damaged images still)
ImageFile.LOAD_TRUNCATED_IMAGES = True
# image files this script will handle
# PIL supported image formats
stdimageext = ('.jpg','.jpeg', '.bmp', '.png', '.gif', '.tif', '.tiff')
# libraw/ufraw supported formats
rawimageext = ('.nef', '.dng', '.tif', '.tiff')
devnull = open(os.devnull, 'w')
corruptRegex = re.compile(r'_\[.+\]\..{3,4}$')
for root, dirs, files in os.walk(sys.argv[1]):
for filename in files:
ext = os.path.splitext(filename.lower())[1]
filepath = os.path.join(root, filename)
if ext in (stdimageext + rawimageext):
hashes = [None] * 4
print(filename)
# reset corrupt string
corrupt_str = None
if ext in (stdimageext):
metadata = pyexiv2.ImageMetadata(filepath)
metadata.read()
rotate = 0
try:
im = Image.open(filepath)
except:
None
else:
for x in range(3):
hashes[x] = imagehash.dhash(im.rotate(90 * (x + 1)),32)
# use jpeginfo against all jpg images as its pretty accurate
if ext in ('.jpg','.jpeg'):
rc = 0
rc = call(["jpeginfo", "--check", filepath], stdout=devnull, stderr=devnull)
if rc == 1:
corrupt_str = 'JpegInfo'
if corrupt_str is None:
try:
im = Image.open(filepath)
im.verify()
except:
e = sys.exc_info()[0]
corrupt_str = 'PIL_Verify'
else:
try:
im = Image.open(filepath)
im.load()
except:
e = sys.exc_info()[0]
corrupt_str = 'PIL_Load'
# raw image processing
else:
# extract largest embedded preview image first
metadata_orig = pyexiv2.ImageMetadata(filepath)
metadata_orig.read()
if len(metadata_orig.previews) > 0:
preview = metadata_orig.previews[-1]
# save preview to temp file
temp_preview = NamedTemporaryFile()
preview.write_to_file(temp_preview.name)
os.rename(temp_preview.name + preview.extension, temp_preview.name)
rotate = 0
try:
im = Image.open(temp_preview.name)
except:
None
else:
for x in range(4):
hashes[x] = imagehash.dhash(im.rotate(90 * (x + 1)),32)
# close temp file
temp_preview.close()
# try to load raw using libraw via rawpy first,
# generally if libraw can't load it then ufraw extraction would also fail
try:
with rawpy.imread(filepath) as im:
None
except:
e = sys.exc_info()[0]
corrupt_str = 'Libraw_Load'
else:
# as a final last ditch effort compare perceptual hashes of extracted
# raw and embedded preview to detect possible internal corruption
if len(metadata_orig.previews) > 0:
# extract and convert raw to jpeg image using ufraw
temp_raw = NamedTemporaryFile(suffix='.jpg')
try:
check_call(['ufraw-batch', '--wb=camera', '--rotate=camera', '--out-type=jpg', '--compression=95', '--noexif', '--lensfun=none', '--output=' + temp_raw.name, '--overwrite', '--silent', filepath],stdout=devnull, stderr=devnull)
except:
e = sys.exc_info()[0]
corrupt_str = 'Ufraw-conv'
else:
rhash = imagehash.dhash(Image.open(temp_raw.name),32)
# compare preview with raw image and compute the most similar hamming distance (best)
hamdiff = .0
for h in range(4):
# calculate hamming distance to compare similarity
hamdiff = max((256 - sum(bool(ord(ch1) - ord(ch2)) for ch1, ch2 in zip(str(hashes[h]), str(rhash))))/256,hamdiff)
if hamdiff < .7: # raw file is probably corrupt
corrupt_str = 'hash' + str(round(hamdiff*100,2))
# close temp files
temp_raw.close()
print(hamdiff)
print(rhash)
print(hashes[0])
print(hashes[1])
print(hashes[2])
print(hashes[3])
# prefix file if corruption was detected ensuring that existing files already prefixed are re prefixed
mo = corruptRegex.search(filename)
if corrupt_str is not None:
if mo is not None:
os.rename(filepath,os.path.join(root, re.sub(corruptRegex, '_[' + corrupt_str + ']', filename) + ext))
else:
os.rename(filepath,os.path.join(root, os.path.splitext(filename)[0] + '_[' + corrupt_str + ']' + ext))
else:
if mo is not None:
os.rename(filepath,os.path.join(root, re.sub(corruptRegex, '', filename) + ext))
EDITED
Just want to provide an update with what I came up with in the end that seems to work quite nicely for my intended purpose and maybe it will prove useful for other users in a similar situation. The script can still use some polishing but otherwise all the meat is there. As I am green with respect using Python if anyone see something that can improved greatly please let me know.
The script does the following:
attempts to detect image corruption in terms of file structure using various methods. For raw image formats (NEF, DNG, TIF) sometimes I found that a corrupt image could still load fine so I decided to hash both the preview image and an extracted .jpg of the raw image and compare the hashes and if they were not similar enough I assume that image is corrupted in some form.
create perceptual hashes for each image that could be loaded. Three are created for the base file (original, original rotated 90, original rotated 180). In addition, for raw images an additional 3 hashes were created for the extracted preview image, this was done so that in cases where the raw image was corrupted we would still have hashes based on the full image (assuming the preview is fine).
for images that are identified as corrupt they are renamed with a suffix that indicates that are corrupt and what determined it.
Pairwise hamming distances were computed by comparing hashes against all file pairs and stored in a numpy array.
square form of pairwise distances are fed to fastcluster for clustering
output from fastcluster is used to generate a dendrogram to visualize clusters of similar images
I save the numpy array to disk so that I can later rerun the fastcluster/dendrogram part without recomputing the hashes for each file which is slow. This is something I have to alter the script to allow yet....
from PIL import Image
from PIL import ImageFile
import os, sys, imagehash, pyexiv2, rawpy, re
from tempfile import NamedTemporaryFile
from subprocess import check_call, call
import numpy as np
from scipy.cluster.hierarchy import dendrogram
from scipy.spatial.distance import squareform
import fastcluster
import matplotlib.pyplot as plt
# allow PIL to load truncated images (so that perceptual hashes can be created for truncated/damaged images still)
ImageFile.LOAD_TRUNCATED_IMAGES = True
# image files this script will handle
# PIL supported image formats
stdimageext = ('.jpg','.jpeg', '.bmp', '.png', '.gif', '.tif', '.tiff')
# libraw/ufraw supported formats
rawimageext = ('.nef', '.dng', '.tif', '.tiff')
devnull = open(os.devnull, 'w')
corruptRegex = re.compile(r'_\[.+\]\..{3,4}$')
hashes = []
filelist = []
for root, _, files in os.walk(sys.argv[1]):
for filename in files:
ext = os.path.splitext(filename.lower())[1]
relpath = os.path.relpath(root, sys.argv[1])
filepath = os.path.join(root, filename)
if ext in (stdimageext + rawimageext):
hashes_tmp = []
rhash = []
# reset corrupt string
corrupt_str = None
if ext in (stdimageext):
try:
im=Image.open(filepath)
for x in range(3):
hashes_tmp.append(str(imagehash.dhash(im.rotate(90 * x, expand=1),32)))
except:
None
# use jpeginfo against all jpg images as its pretty accurate
if ext in ('.jpg','.jpeg'):
rc = 0
rc = call(["jpeginfo", "--check", filepath], stdout=devnull, stderr=devnull)
if rc == 1:
corrupt_str = 'JpegInfo'
if corrupt_str is None:
try:
im = Image.open(filepath)
im.verify()
except:
e = sys.exc_info()[0]
corrupt_str = 'PIL_Verify'
else:
try:
im = Image.open(filepath)
im.load()
except:
e = sys.exc_info()[0]
corrupt_str = 'PIL_Load'
# raw image processing
if ext in (rawimageext):
# extract largest embedded preview image first
metadata_orig = pyexiv2.ImageMetadata(filepath)
metadata_orig.read()
if len(metadata_orig.previews) > 0:
preview = metadata_orig.previews[-1]
# save preview to temp file
temp_preview = NamedTemporaryFile()
preview.write_to_file(temp_preview.name)
os.rename(temp_preview.name + preview.extension, temp_preview.name)
try:
im = Image.open(temp_preview.name)
for x in range(3):
hashes_tmp.append(str(imagehash.dhash(im.rotate(90 * x,expand=1),32)))
except:
None
# try to load raw using libraw via rawpy first,
# generally if libraw can't load it then ufraw extraction would also fail
try:
im = rawpy.imread(filepath)
except:
e = sys.exc_info()[0]
corrupt_str = 'Libraw_Load'
else:
# as a final last ditch effort compare perceptual hashes of extracted
# raw and embedded preview to detect possible internal corruption
# extract and convert raw to jpeg image using ufraw
temp_raw = NamedTemporaryFile(suffix='.jpg')
try:
check_call(['ufraw-batch', '--wb=camera', '--rotate=camera', '--out-type=jpg', '--compression=95', '--noexif', '--lensfun=none', '--output=' + temp_raw.name, '--overwrite', '--silent', filepath],stdout=devnull, stderr=devnull)
except:
e = sys.exc_info()[0]
corrupt_str = 'Ufraw-conv'
else:
try:
im = Image.open(temp_raw.name)
for x in range(3):
rhash.append(str(imagehash.dhash(im.rotate(90 * x,expand=1),32)))
except:
None
# compare preview with raw image and compute the most similar hamming distance (best)
if len(hashes_tmp) > 0 and len(rhash) > 0:
hamdiff = 1
for rh in rhash:
# calculate hamming distance to compare similarity
hamdiff = min(hamdiff,(sum(bool(ord(ch1) - ord(ch2)) for ch1, ch2 in zip(hashes_tmp[0], rh))/len(hashes_tmp[0])))
if hamdiff > .3: # raw file is probably corrupt
corrupt_str = 'hash' + str(round(hamdiff*100,2))
hashes_tmp = hashes_tmp + rhash
# prefix file if corruption was detected ensuring that existing files already prefixed are re prefixed
mo = corruptRegex.search(filename)
newfilename = None
if corrupt_str is not None:
if mo is not None:
newfilename = re.sub(corruptRegex, '_[' + corrupt_str + ']', filename) + ext
else:
newfilename = os.path.splitext(filename)[0] + '_[' + corrupt_str + ']' + ext
else:
if mo is not None:
newfilename = re.sub(corruptRegex, '', filename) + ext
if newfilename is not None:
os.rename(filepath,os.path.join(root, newfilename))
if len(hashes_tmp) > 0:
hashes.append(hashes_tmp)
if newfilename is not None:
filelist.append(os.path.join(relpath, newfilename))
else:
filelist.append(os.path.join(relpath, filename))
print(len(filelist))
print(len(hashes))
a = np.empty(shape=(len(filelist),len(filelist)))
for hash_idx1, hash in enumerate(hashes):
a[hash_idx1,hash_idx1] = 0
hash_idx2 = hash_idx1 + 1
while hash_idx2 < len(hashes):
ham_dist = 1
for h1 in hash:
for h2 in hashes[hash_idx2]:
ham_dist = min(ham_dist, (sum(bool(ord(ch1) - ord(ch2)) for ch1, ch2 in zip(h1, h2)))/len(h1))
a[hash_idx1,hash_idx2] = ham_dist
a[hash_idx2,hash_idx1] = ham_dist
hash_idx2 = hash_idx2 + 1
print(a)
X = squareform(a)
print(X)
linkage = fastcluster.single(X)
clustdict = {i:[i] for i in range(len(linkage)+1)}
fig = plt.figure(figsize=(25,25))
plt.title('test title')
plt.xlabel('perpetual hash hamming distance')
plt.axvline(x=.15,c='red',linestyle='--')
dg = dendrogram(linkage, labels=filelist, orientation='right', show_leaf_counts=True)
ax = fig.gca()
ax.set_xlim(-.01,ax.get_xlim()[1])
plt.show
plt.savefig('foo1.pdf', bbox_inches='tight', dpi=100)
with open('numpyarray.npy','wb') as f:
np.save(f,a)
It took awhile...but I figured things out eventually and got a script that does a pretty good job of identifying if an image is corrupt and then uses perceptual hashes to try and group similar images together.
from PIL import Image, ImageFile
import os, sys, imagehash, pyexiv2, rawpy, re
from tempfile import NamedTemporaryFile
from subprocess import Popen, PIPE
import shlex
import numpy as np
from scipy.cluster.hierarchy import dendrogram, fcluster
from scipy.spatial.distance import squareform
import fastcluster
#import matplotlib.pyplot as plt
import math
import string
from wand.image import Image as wImage
import wand.exceptions
from io import BytesIO
from datetime import datetime
#import fd_table_status
def redirect_stdout():
print("Redirecting stdout and stderr")
sys.stdout.flush() # <--- important when redirecting to files
sys.stderr.flush()
newstdout = os.dup(1)
newstderr = os.dup(2)
devnull = os.open(os.devnull, os.O_WRONLY)
devnull2 = os.open(os.devnull, os.O_WRONLY)
os.dup2(devnull, 1)
os.dup2(devnull2,2)
os.close(devnull)
os.close(devnull2)
sys.stdout = os.fdopen(newstdout, 'w')
sys.stderr = os.fdopen(newstderr, 'w')
redirect_stdout()
def ct(linkage_matrix,flist,score):
cluster_id = []
for fidx, file_ in enumerate(flist):
link_ = np.where(linkage_matrix[:,:2] == fidx)[0]
if len(link_) == 1:
link = link_[0]
if linkage_matrix[link][2] <= score:
fcluster_idx = str(link).zfill(len(str(len(linkage_matrix))))
while True:
match = np.where(linkage_matrix[:,:2] == link+1+len(linkage_matrix))[0]
if len(match) == 1:
link = match[0]
link_d = linkage_matrix[link]
if link_d[2] <= score:
fcluster_idx = str(match[0]).zfill(len(str(len(linkage_matrix)))) + fcluster_idx
else:
break
else:
break
else:
fcluster_idx = None
cluster_id.append(fcluster_idx)
return cluster_id
def get_exitcode_stdout_stderr(cmd):
"""
Execute the external command and get its exitcode, stdout and stderr.
"""
args = shlex.split(cmd)
proc = Popen(args, stdout=PIPE, stderr=PIPE, close_fds=True)
out, err = proc.communicate()
exitcode = proc.returncode
del proc
return exitcode, out, err
if os.path.isdir(sys.argv[1]):
start_time = datetime.now()
# allow PIL to load truncated images (so that perceptual hashes can be created for truncated/damaged images still)
ImageFile.LOAD_TRUNCATED_IMAGES = True
# image files this script will handle
# PIL supported image formats
stdimageext = ('.jpg','.jpeg', '.bmp', '.png', '.gif', '.tif', '.tiff')
# libraw/ufraw supported formats
rawimageext = ('.nef', '.dng', '.tif', '.tiff')
corruptRegex = re.compile(r'_\[.+\]\..{3,4}$')
groupRegex = re.compile(r'^\[\d+\]_')
ufrawRegex = re.compile(r'Corrupt data near|Unexpected end of file|has the wrong dimensions!|Cannot open file|Cannot decode file|requests a nonexistent image!')
for subdirs,dirs,files in os.walk(sys.argv[1]):
files.clear()
dirs.clear()
for root,_,files in os.walk(subdirs):
print('\n******** Processing files in ' + root)
hashes = []
w_hash = []
w_hash_idx = []
filelist = []
files_ = []
cnt = 0
for f in files:
#cnt = cnt + 1
#if cnt < 10:
files_.append(f)
continue
cnt = 0
for f_idx, fname in enumerate(files_):
e=None
ext = os.path.splitext(fname.lower())[1]
filepath = os.path.join(root, fname)
imformat = ''
hashes_tmp = []
# reset corrupt string
corrupt_str = None
if ext in (stdimageext + rawimageext):
print(str(int(round(((f_idx+1)/len(files_))*100))) + '%' + ' : ' + fname + '....', end='', flush=True)
try:
with wImage(filename=filepath) as im:
imformat = '.' + im.format.lower()
ext = imformat if imformat is not '' else ext
with im.convert('jpeg') as converted:
jpeg_bin = converted.make_blob()
with Image.open(BytesIO(jpeg_bin)) as im2:
hash_image = []
for x in range(3):
print('.',end='',flush=True)
hash_i = str(imagehash.dhash(im2.rotate(90 * x, expand=1),32))
if ''.join(set(hash_i)) != '0':
hash_image.append(hash_i)
if hash_image:
hash_image.append(1)
hashes_tmp.append(hash_image)
except:
e = sys.exc_info()[0]
errcode = str([k for k, v in wand.exceptions.TYPE_MAP.items() if v == e][0]).zfill(3)
if int(errcode[-2:]) in (15,25,30,35,40,50,55):
corrupt_str = 'magick'
finally:
try:
im.close()
except:
pass
try:
im2.close()
except:
pass
if ext in (stdimageext):
try:
with Image.open(filepath) as im:
hash_image = []
for x in range(3):
print('.',end='',flush=True)
hash_i = str(imagehash.dhash(im.rotate(90 * x, expand=1),32))
if ''.join(set(hash_i)) != '0':
hash_image.append(hash_i)
if hash_image:
hash_image.append(2)
hashes_tmp.append(hash_image)
except:
pass
finally:
try:
im.close()
except:
pass
# use jpeginfo against all jpg images as its pretty accurate
if ext in ('.jpg','.jpeg'):
#rc = 0
print('.',end='',flush=True)
cmd = 'jpeginfo --check "' + filepath + '"'
exitcode, out, err = get_exitcode_stdout_stderr(cmd)
#rc = call(["jpeginfo", "--check", filepath], stdout=DEVNULL, stderr=DEVNULL, close_fds=True)
if exitcode == 1:
corrupt_str = 'JpegInfo' if corrupt_str == None else corrupt_str
#del rc
if corrupt_str is None:
try:
with Image.open(filepath) as im:
print('.',end='',flush=True)
im.verify()
except:
e = sys.exc_info()[0]
corrupt_str = 'PIL_Verify' if corrupt_str == None else corrupt_str
else:
try:
with Image.open(filepath) as im:
print('.',end='',flush=True)
temp = im.copy()
im.load()
except:
e = sys.exc_info()[0]
corrupt_str = 'PIL_Load' if corrupt_str == None else corrupt_str
finally:
try:
temp.close()
except:
pass
try:
im.close()
except:
pass
finally:
try:
im.close()
except:
pass
try:
temp.close()
except:
pass
# raw image processing
if ext in (rawimageext):
print('.',end='',flush=True)
# try to load raw using libraw via rawpy first,
# generally if libraw can't load it then ufraw extraction would also fail
if corrupt_str == None:
try:
with rawpy.imread(filepath) as raw:
rgb = raw.postprocess(use_camera_wb=True)
temp_raw = NamedTemporaryFile(suffix='.jpg')
Image.fromarray(rgb).save(temp_raw.name)
with Image.open(temp_raw.name) as im:
hash_image = []
for x in range(3):
print('.',end='',flush=True)
hash_i = str(imagehash.dhash(im.rotate(90 * x, expand=1),32))
if ''.join(set(hash_i)) != '0':
hash_image.append(hash_i)
if hash_image:
hash_image.append(3)
hashes_tmp.append(hash_image)
except(rawpy.LibRawFatalError):
e = sys.exc_info()[1]
corrupt_str = 'Libraw_FE'
except(rawpy.LibRawNonFatalError):
e = sys.exc_info()[1]
corrupt_str = 'Libraw_NFE'
except:
#print(sys.exc_info())
corrupt_str = 'Libraw'
finally:
try:
im.close()
except:
pass
try:
temp_raw.close()
except:
pass
try:
raw.close()
except:
pass
if corrupt_str == None:
# as a final last ditch effort compare perceptual hashes of extracted
# raw and embedded preview to detect possible internal corruption
# extract and convert raw to jpeg image using ufraw
temp_raw = NamedTemporaryFile(suffix='.jpg')
#rc = 0
cmd = 'ufraw-batch --wb=camera --rotate=camera --out-type=jpg --compression=95 --noexif --lensfun=none --auto-crop --output=' + temp_raw.name + ' --overwrite "' + filepath + '"'
print('.',end='',flush=True)
exitcode, out, err = get_exitcode_stdout_stderr(cmd)
if exitcode == 1 or ufrawRegex.search(str(err)) is not None:
corrupt_str = 'Ufraw' if corrupt_str is None else corrupt_str
tmpfilesize = os.stat(temp_raw.name).st_size
if tmpfilesize > 0:
try:
with Image.open(temp_raw.name) as im:
hash_image = []
for x in range(3):
print('.',end='',flush=True)
hash_i = str(imagehash.dhash(im.rotate(90 * x, expand=1),32))
if ''.join(set(hash_i)) != '0':
hash_image.append(hash_i)
if hash_image:
hash_image.append(4)
hashes_tmp.append(hash_image)
except:
pass
finally:
try:
im.close()
except:
pass
try:
temp_raw.close()
except:
pass
# attempt to extract preview images
imfile = filepath
try:
with pyexiv2.ImageMetadata(imfile) as metadata_orig:
metadata_orig.read()
#for i,p in enumerate(metadata_orig.previews):
if metadata_orig.previews:
preview = metadata_orig.previews[-1]
# save preview to temp file
temp_preview = NamedTemporaryFile()
preview.write_to_file(temp_preview.name)
os.rename(temp_preview.name + preview.extension, temp_preview.name)
try:
with Image.open(temp_preview.name) as im:
hash_image = []
for x in range(3):
print('.',end='',flush=True)
hash_i = str(imagehash.dhash(im.rotate(90 * x, expand=1),32))
if ''.join(set(hash_i)) != '0':
hash_image.append(hash_i)
if hash_image:
hash_image.append(5)
hashes_tmp.append(hash_image)
except:
pass
finally:
try:
temp_preview.close()
except:
pass
try:
im.close()
except:
pass
except:
pass
finally:
try:
metadata_orig.close()
except:
pass
# compare hashes for all images that were found or extracted and find most dissimilar hamming distance (worst)
if len(hashes_tmp) > 1:
#print('checking_hashes')
print('.',end='',flush=True)
scores = []
for h_idx, hash in enumerate(hashes_tmp):
i = h_idx + 1
while i < len(hashes_tmp):
ham_dist = 1
for h1 in hash[:-1]:
for h2 in hashes_tmp[i][:-1]:
ham_dist = min(ham_dist, (sum(bool(ord(ch1) - ord(ch2)) for ch1, ch2 in zip(h1, h2)))/len(h1))
if (hash[-1] == 5 and hashes_tmp[i][-1] != 5) or (hash[-1] != 5 and hashes_tmp[i][-1] == 5):
scores.append([ham_dist,hash[-1],hashes_tmp[i][-1]])
i = i + 1
if scores:
worst = sorted(scores, key = lambda x: x[0])[-1]
if worst[0] > 0.3:
worst1 = str(worst[1])
worst2 = str(worst[2])
corrupt_str = 'hash' + str(round(worst[0]*100,2)) + '_' + worst1 + '-' + worst2 if corrupt_str == None else corrupt_str
# prefix file if corruption was detected ensuring that existing files already prefixed are re prefixed
mo = corruptRegex.search(fname)
newfilename = None
if corrupt_str is not None:
print('Corrupt: ' + corrupt_str)
if mo is not None:
newfilename = re.sub(corruptRegex, '_[' + corrupt_str + ']', fname) + ext
else:
newfilename = os.path.splitext(fname)[0] + '_[' + corrupt_str + ']' + ext
else:
print('OK!')
if mo is not None:
newfilename = re.sub(corruptRegex, '', fname) + ext
# remove group index from name if present, this will be assigned in the next step if needed
newfilename = newfilename if newfilename is not None else fname
mo = groupRegex.search(newfilename)
if mo is not None:
newfilename = re.sub(groupRegex, '', newfilename)
if hashes_tmp:
# set function unduplicates flattened list
hashes.append(set([item for sublist in hashes_tmp for item in sublist[:-1]]))
filelist.append([root,fname,newfilename, len(hashes_tmp)])
print('******** Grouping similar images... ************')
if len(hashes) > 1:
scores = []
for h_idx, hash in enumerate(hashes):
i = h_idx + 1
while i < len(hashes):
ham_dist = 1
for h1 in hash:
for h2 in hashes[i]:
ham_dist = min(ham_dist, (sum(bool(ord(ch1) - ord(ch2)) for ch1, ch2 in zip(h1, h2)))/len(h1))
scores.append(ham_dist)
i = i + 1
X = np.array(scores)
linkage = fastcluster.single(X)
w_hash_idx = [el_idx for el_idx, el in enumerate(filelist) if el[3] > 0]
w_hash = [filelist[i] for i in w_hash_idx]
test=ct(linkage,[el[2] for el in w_hash],.2)
for i, prfx in enumerate(test):
curfilename = w_hash[i][2]
mo = groupRegex.search(curfilename)
newfilename = None
if prfx is not None:
if mo is not None:
newfilename = re.sub(groupRegex, '[' + prfx + ']_', curfilename)
else:
newfilename = '[' + prfx + ']_' + curfilename
else:
if mo is not None:
newfilename = re.sub(groupRegex, '', curfilename)
# if newfilename is not None:
filelist[w_hash_idx[i]][2] = newfilename if newfilename is not None else curfilename
#fig = plt.figure(figsize=(25,25))
#plt.title(root)
#plt.xlabel('perpetual hash hamming distance')
#plt.axvline(x=.15,c='red',linestyle='--')
#dg = dendrogram(linkage, labels=[el[2] for el in w_hash], orientation='right', show_leaf_counts=True)
#ax = fig.gca()
#ax.set_xlim(-.02,ax.get_xlim()[1])
#plt.show
#plt.savefig(os.path.join(root,'dendrogram.pdf'), bbox_inches='tight', dpi=100)
w_hash.clear()
w_hash_idx.clear()
print('******** Renameing file if applicable... ************')
for fr in filelist:
if fr[1] != fr[2]:
#print(fr[1] + ' -- ' + fr[2])
path = fr[0]
os.rename(os.path.join(path,fr[1]),os.path.join(path,fr[2]))
filelist.clear()
duration = datetime.now() - start_time
days = divmod(duration.total_seconds(), 86400) # Get days (without [0]!)
hours = divmod(days[1], 3600) # Use remainder of days to calc hours
minutes = divmod(hours[1], 60) # Use remainder of hours to calc minutes
seconds = divmod(minutes[1], 1) # Use remainder of minutes to calc seconds
print("Time to complete: %d days, %d:%d:%d" % (days[0], hours[0], minutes[0], seconds[0]))

Categories