Lucas-Kanade image alignment algorithm implementation not converging? - python

I've recently been attempting to implement the Lucas-Kanade algorithm for image alignment, as detailed in this paper here:
I've managed to implement the algorithm detailed in page 4 of the paper I linked, but the loss doesn't seem to converge. I've been looking over my code and my math, and can't seem to figure out where I might be going wrong.
What I've tried so far is implementing the entire algorithm, and re-doing my math for calculating the Jacobian of the warp, as well as just general checking of my code.
My code is below, as well as a more readable version on Pastebin:
import cv2
import numpy as np
import matplotlib.pyplot as plt
def calculate_steepest_descent(grad_x_warped, grad_y_warped, h):
    """Compute the steepest-descent images for a homography warp.

    For every pixel the 1 x 2 image gradient is multiplied by the 2 x 8
    Jacobian of the warp, which yields one 8-vector per pixel.

    Args:
        grad_x_warped: 2-D array with the x-gradient at each pixel.
        grad_y_warped: 2-D array with the y-gradient, same shape.
        h: 3 x 3 homography. Its first eight entries (row-major) are the warp
            parameters; the ninth is taken to be 1 in the denominator.

    Returns:
        Array of shape (rows, columns, 8).
    """
    rows, columns = grad_x_warped.shape
    steepest_descent = np.zeros((rows, columns, 8))
    # 2 x 8 because it's a homography, would be 2 x 6 if it was affine.
    # Entries [0, 3:6] and [1, 0:3] are always zero, so they are never written.
    warp_jacobian = np.zeros((2, 8))
    current_gradient = np.zeros((1, 2))
    # Flatten the homography into a parameter array so the expressions below
    # read like the math.
    p = np.asarray(h).flatten()
    for y in range(rows):
        for x in range(columns):
            # Jacobian of the warp at this pixel, evaluated at the current
            # parameter values.
            common_denominator = p[6] * x + p[7] * y + 1
            denominator_squared = common_denominator ** 2
            warp_jacobian[0, 0] = x / common_denominator
            warp_jacobian[0, 1] = y / common_denominator
            warp_jacobian[0, 2] = 1 / common_denominator
            warp_jacobian[0, 6] = -(p[0] * (x ** 2) + p[1] * x * y + p[2] * x) / denominator_squared
            warp_jacobian[0, 7] = -(p[1] * (y ** 2) + p[0] * x * y + p[2] * y) / denominator_squared
            warp_jacobian[1, 3] = x / common_denominator
            warp_jacobian[1, 4] = y / common_denominator
            warp_jacobian[1, 5] = 1 / common_denominator
            warp_jacobian[1, 6] = -(p[3] * (x ** 2) + p[4] * x * y + p[5] * x) / denominator_squared
            warp_jacobian[1, 7] = -(p[4] * (y ** 2) + p[3] * x * y + p[5] * y) / denominator_squared
            # Gradient values at the current pixel location
            current_gradient[0, 0] = grad_x_warped[y, x]
            current_gradient[0, 1] = grad_y_warped[y, x]
            # Full Jacobian (aka steepest descent image) at the current pixel
            steepest_descent[y, x, :] = np.dot(current_gradient, warp_jacobian)[0]
    return steepest_descent
def calculate_hessian(steepest_descent):
    """Sum the outer products of the per-pixel steepest-descent vectors.

    Args:
        steepest_descent: array whose last axis holds one vector per pixel,
            e.g. shape (rows, columns, channels).

    Returns:
        (channels, channels) array equal to the sum over all pixels of
        ``v.T @ v`` for each pixel's row vector ``v``.
    """
    channels = steepest_descent.shape[-1]
    # One row per pixel; A.T @ A is exactly the sum of the per-row outer
    # products, so no Python-level pixel loop is needed.
    per_pixel = steepest_descent.reshape(-1, channels)
    return np.dot(per_pixel.T, per_pixel)
def calculate_sd_param_updates(steepest_descent, img_error):
    """Accumulate each pixel's steepest-descent vector weighted by its error.

    Args:
        steepest_descent: array of shape (rows, columns, channels).
        img_error: array of shape (rows, columns) with the per-pixel error.

    Returns:
        (channels, 1) column vector: the sum over all pixels of the pixel's
        steepest-descent vector times its error value.
    """
    channels = steepest_descent.shape[-1]
    per_pixel = steepest_descent.reshape(-1, channels)
    # Work in float so an integer-typed error image cannot overflow.
    errors = np.asarray(img_error, dtype=np.float64).reshape(-1, 1)
    return np.dot(per_pixel.T, errors)
def calculate_final_param_updates(sd_param_updates, hessian):
    """Multiply the inverse Hessian by the steepest-descent parameter updates.

    Args:
        sd_param_updates: (n, 1) column vector.
        hessian: (n, n) matrix; must be invertible.

    Returns:
        (n, 1) column vector ``inv(hessian) @ sd_param_updates``.

    Raises:
        numpy.linalg.LinAlgError: if ``hessian`` is singular.
    """
    hessian_inverse = np.linalg.inv(hessian)
    final_param_updates = np.dot(hessian_inverse, sd_param_updates)
    return final_param_updates
if __name__ == "__main__":
    # Load image
    reference = cv2.imread('test.png')
    if reference is None:
        # cv2.imread signals an unreadable file by returning None.
        raise FileNotFoundError("could not read 'test.png'")
    reference = cv2.cvtColor(reference, cv2.COLOR_BGR2GRAY)
    # Generate template as small block from within reference image using homography
    # 'h' is the ground truth homography for warping reference image onto template image
    template_size = (100, 100)
    h = np.float32([[1, 0, -100], [0, 1, -100], [0, 0, 1]])
    h_ground_truth = h.copy()
    template = cv2.warpPerspective(reference, h, template_size)
    # Convert template corner points to reference image coordinate plane
    template_corners = np.array([[0, 0], [0, 100], [100, 100], [100, 0]])
    h_inverse = np.linalg.inv(h)
    reference_corners = cv2.perspectiveTransform(np.array([template_corners], dtype='float32'), h_inverse)
    # Small perturbation to ground truth homography: +10 on both translation
    # entries, every other entry left untouched.
    h_mod = np.zeros(h.shape)
    h_mod[0, 2] = 10
    h_mod[1, 2] = 10
    h = h + h_mod
    # ##############################
    # Lucas-Kanade algorithm below
    # This is supposed to calculate the homography that undoes the small perturbation
    # and returns a homography as close as possible to the ground truth homography
    # ##############################
    # Precompute image gradients
    grad_x = cv2.Sobel(reference, cv2.CV_64F, 1, 0, ksize=1)
    grad_y = cv2.Sobel(reference, cv2.CV_64F, 0, 1, ksize=1)
    # Loop algorithm for given # of steps
    for i in range(1000):
        # Step 1
        # Warp reference image onto coordinate frame of template
        reference_transformed = cv2.warpPerspective(reference, h, template_size)
        # Step 2
        # Compute error image. Both operands are converted to float first:
        # subtracting unsigned 8-bit images wraps around instead of going
        # negative, which corrupts the error signal.
        img_error = template.astype(np.float64) - reference_transformed.astype(np.float64)
        # Step 3
        # Warp the gradients
        grad_x_warped = cv2.warpPerspective(grad_x, h, template_size)
        grad_y_warped = cv2.warpPerspective(grad_y, h, template_size)
        # Step 4 & 5
        # Use Jacobian of warp to calculate steepest descent images
        steepest_descent = calculate_steepest_descent(grad_x_warped, grad_y_warped, h)
        # Step 6
        # Compute Hessian matrix
        hessian = calculate_hessian(steepest_descent)
        # Step 7
        # Compute steepest descent parameter updates by
        # dot producting error image with steepest descent images
        sd_param_updates = calculate_sd_param_updates(steepest_descent, img_error)
        # Step 8
        # Compute final parameter updates
        final_param_updates = calculate_final_param_updates(sd_param_updates, hessian)
        # Step 9
        # Update the parameters (the last homography entry stays fixed)
        h = h.reshape(-1, 1)
        h[:-1] += final_param_updates
        h = h.reshape(3, 3)
        # Step 10
        # Calculate norm of parameter updates
        final_param_update_norm = np.linalg.norm(final_param_updates)
        print("Final Param Norm: {}".format(final_param_update_norm))
        reference_transformed = cv2.warpPerspective(reference, h, template_size)
        cv2.imwrite('warps/warp_{}.png'.format(i), reference_transformed)
    # Warp source image to destination based on homography
    reference_transformed = cv2.warpPerspective(reference, h, template_size)
    cv2.imwrite('final_warp.png', reference_transformed)
It should just need a reference image to test with.
The expected result is that the algorithm converges to a homography that matches the ground truth homography I calculate in the code, but the loss just seems to explode instead and I end up with a totally incorrect homography.

This should be a comment, since I am not certain it is the full cause of your problem, but it might be part of it.
To solve a system of linear equations don't compute the inverse
hessian_inverse = np.linalg.inv(hessian)
and then multiply by it
final_param_updates = np.dot(hessian_inverse, sd_param_updates)
This is both wasteful and can cause more numerical instability than solving systems of linear equations normally have.
Instead, use the solve method: np.linalg.solve(hessian, sd_param_updates).
Computing the inverse amounts to solving the system once for every column of the identity matrix. None of that extra work is needed when you only want the solution for a single right-hand side.


What am I doing wrong with Affine Transform of Parallelogram into Rectangle?

I have two shapes, a rectangle and a parallelogram that signify two gantry systems. The one gantry system has a camera on it and can detect the position of the other gantry system as it sits above. I cannot via a series of transforms (translate, rotate, shear x, shear y, translate) get it even remotely close to fitting to the system 1. Could I please get some pointers/insight as to what I am doing wrong?
I've tested each transform with a unit vector, so I know the math works. I suspect that either I am using the incorrect angles (although I use the same ones on the unit vectors), there are linearity issues where the system is not quite linear and therefore the transforms won't work (this also seems unlikely given the physical nature of the system), or, most likely, my order of operations is incorrect.
from matplotlib import pyplot as plt
import numpy as np
from mpl_toolkits.axes_grid1.inset_locator import TransformedBbox, BboxPatch, BboxConnector
def get_angle(array, array_2, side=3):
    """Return the signed angle between the same side of two quadrilaterals.

    Args:
        array: indexable of at least four 2-D points (rows must support
            subtraction, e.g. a NumPy array).
        array_2: second set of points, indexed the same way.
        side: which side to compare:
            0 -> point 0 to point 1, 1 -> point 1 to point 2,
            2 -> point 3 to point 2, 3 -> point 0 to point 3.

    Returns:
        Angle in radians, in (-pi, pi], from the side of ``array`` to the
        side of ``array_2``; positive when counter-clockwise.

    Raises:
        ValueError: if ``side`` is not one of 0, 1, 2, 3.
    """
    # (start, end) point indices of each side
    side_points = {0: (0, 1), 1: (1, 2), 2: (3, 2), 3: (0, 3)}
    if side not in side_points:
        raise ValueError("side must be 0, 1, 2 or 3, got {!r}".format(side))
    start, end = side_points[side]
    vector = array[end] - array[start]
    vector_2 = array_2[end] - array_2[start]
    dot = vector[0] * vector_2[0] + vector[1] * vector_2[1]  # proportional to cos
    det = vector[0] * vector_2[1] - vector[1] * vector_2[0]  # proportional to sin
    return np.arctan2(det, dot)  # atan2(sin, cos)
def shear_trans_x(coords, phi):
    """Shear 2-D points along x: ``x' = x + tan(phi) * y``, ``y' = y``.

    Args:
        coords: (n, 2) array of points.
        phi: shear angle in radians.

    Returns:
        (n, 2) array of sheared points.
    """
    shear_x = np.array([[1, np.tan(phi), 0],
                        [0, 1, 0],
                        [0, 0, 1]])
    # Lift to homogeneous coordinates so the 3 x 3 matrix applies.
    coords = np.append(coords, np.ones((coords.shape[0], 1)), axis=1)
    resultant = coords @ shear_x.T
    return resultant[:, 0:2]
def shear_trans_y(coords, psi):
    """Shear 2-D points along y: ``x' = x``, ``y' = tan(psi) * x + y``.

    Args:
        coords: (n, 2) array of points.
        psi: shear angle in radians.

    Returns:
        (n, 2) array of sheared points.
    """
    shear_y = np.array([[1, 0, 0],
                        [np.tan(psi), 1, 0],
                        [0, 0, 1]])
    # Lift to homogeneous coordinates so the 3 x 3 matrix applies.
    coords = np.append(coords, np.ones((coords.shape[0], 1)), axis=1)
    resultant = coords @ shear_y.T
    return resultant[:, 0:2]
def translate(coordinates, offset):
    """Shift 2-D points by ``offset``.

    Args:
        coordinates: (n, 2) array of points.
        offset: pair ``(dx, dy)`` added to every point.

    Returns:
        (n, 2) array of translated points.
    """
    # Lift to homogeneous coordinates so translation is a matrix product.
    coordinates = np.append(coordinates, np.ones((coordinates.shape[0], 1)), axis=1)
    a = np.array([[1, 0, offset[0]],
                  [0, 1, offset[1]],
                  [0, 0, 1]])
    result = coordinates @ a.T
    return result[:, 0:2]
def rotate(coords, theta, origin=(0, 0)):
    """Rotate 2-D points counter-clockwise by ``theta`` about ``origin``.

    Args:
        coords: (n, 2) array of points.
        theta: rotation angle in radians.
        origin: centre of rotation as a pair; any sequence or array works.

    Returns:
        (n, 2) array of rotated points.
    """
    # Convert once so a list or tuple origin can be negated and broadcast,
    # and so the default is no longer a shared mutable list.
    origin = np.asarray(origin, dtype=float)
    cos = np.cos(theta)
    sin = np.sin(theta)
    rotation = np.array([[cos, -sin],
                         [sin, cos]])
    # Shift so the origin sits at (0, 0), rotate, then shift back.
    return (np.asarray(coords) - origin) @ rotation.T + origin
def mark_inset(parent_axes, inset_axes, loc1a=1, loc1b=1, loc2a=2, loc2b=2, **kwargs):
    """Build the patches that mark an inset's region in its parent axes.

    Creates a bbox patch for the region of the inset axes in the parent axes
    and two connecting lines between that bbox and the inset axes area.

    Args:
        parent_axes: axes whose data transform positions the region.
        inset_axes: axes whose view limits define the region.
        loc1a, loc1b: corner codes {1, 2, 3, 4} for the first connector
            (inset corner, region corner).
        loc2a, loc2b: corner codes {1, 2, 3, 4} for the second connector.
        **kwargs: forwarded to every patch constructor.

    Returns:
        Tuple ``(pp, p1, p2)``: the bbox patch and the two connectors.
    """
    # NOTE(review): the patches are only constructed and returned here;
    # nothing in this function adds them to either axes - confirm the caller
    # does that.
    rect = TransformedBbox(inset_axes.viewLim, parent_axes.transData)
    p1 = BboxConnector(inset_axes.bbox, rect, loc1=loc1a, loc2=loc1b, **kwargs)
    p2 = BboxConnector(inset_axes.bbox, rect, loc1=loc2a, loc2=loc2b, **kwargs)
    pp = BboxPatch(rect, fill=False, **kwargs)
    return pp, p1, p2
if __name__ == '__main__':
# calibration data
gantry_1_coords = np.array([[169.474, 74.4851], [629.474, 74.4851], [629.474, 334.4851], [169.474, 334.4851]])
gantry_2_coords_error = np.array([[-0.04, 0.04], [-0.04, -0.31], [0.76, -0.57], [1.03, 0.22]])
# gantry_2_coords_error = np.array([[0.13, 0.04], [-0.13, -0.75], [0.31, -0.93], [0.58, -0.31]])
# add error to gantry 1 coords
gantry_2_coords = gantry_1_coords + gantry_2_coords_error
# append first point to end for plotting to display a closed box
gantry_1_coords = np.append(gantry_1_coords, np.array([gantry_1_coords[0]]), axis=0)
gantry_2_coords = np.append(gantry_2_coords, np.array([gantry_2_coords[0]]), axis=0)
# get length of diagonal direction
magnitude = np.linalg.norm(gantry_1_coords[0] - gantry_1_coords[2])
magnitude_gantry_2 = np.linalg.norm(gantry_2_coords[0] - gantry_2_coords[2])
# translate to gantry_1 first position
translated_gantry_2 = translate(gantry_2_coords, (gantry_1_coords[0] - gantry_2_coords[0]))
print('translation_offset_1', ' = ', gantry_1_coords[0] - gantry_2_coords[0])
# rotate gantry_2 to gantry_1
theta = get_angle(translated_gantry_2, gantry_1_coords, side=0)
rotate_gantry_2_coords = rotate(translated_gantry_2, theta, translated_gantry_2[0])
print('rotation angle', ' = ', theta)
# un-shear x axis gantry_2
shear_phi = get_angle(rotate_gantry_2_coords, gantry_1_coords, side=3)
sheared_x_gantry_2 = shear_trans_x(rotate_gantry_2_coords, shear_phi)
print('shear x angle', ' = ', shear_phi)
# un-shear y axis gantry_2
shear_psi = get_angle(sheared_x_gantry_2, gantry_1_coords, side=2)
sheared_y_gantry_2 = shear_trans_y(sheared_x_gantry_2, shear_psi)
print('shear y angle', ' = ', shear_psi)
# translate to gantry_1 first position
final_gantry_2_coords = translate(sheared_y_gantry_2, (gantry_1_coords[0] - sheared_y_gantry_2[0]))
print('translation_offset_2', ' = ', gantry_1_coords[0] - sheared_y_gantry_2[0])
# create exaggerated errors for plotting
ex_gantry_2_coords = (gantry_2_coords - gantry_1_coords) * 50 + gantry_2_coords
ex_gantry_2_final_coords = (final_gantry_2_coords - gantry_1_coords) * 50 + final_gantry_2_coords
# separate out x & y components for plotting
gantry_1_x, gantry_1_y = gantry_1_coords.T
gantry_2_x, gantry_2_y = ex_gantry_2_coords.T
gantry_2_final_x, gantry_2_final_y = ex_gantry_2_final_coords.T
# plot results
fig, ax = plt.subplots()
ax.plot(gantry_1_x, gantry_1_y, color='black', linestyle='--', label='gantry_1')
ax.plot(gantry_2_x, gantry_2_y, color='blue', linestyle='--', label='gantry_2 original')
ax.plot(gantry_2_final_x, gantry_2_final_y, color='red', linestyle='--', label='gantry_2 transformed')
# get legend lines and labels from center graph
lines, labels = ax.get_legend_handles_labels()
fig.legend(lines, labels)
# print('gantry 1 positions: ', gantry_1_coords)
# print('transformed gantry 2 positions: ', final_gantry_2_coords)
Fixing existing code
In terms of the existing code, I applied the transformations one by one, and I think you're missing a negative sign here:
sheared_x_gantry_2 = shear_trans_x(rotate_gantry_2_coords, -shear_phi)
# ^--- here
After applying that, the graph looks better:
Least squares fit
However, I think this is the wrong general approach. For example, when you fix the shear, that's going to break the translation and rotation, at least a little bit. You can repeatedly apply the fixes, and converge on the right answer, but there's a better way.
Instead, I would suggest finding a least-squares fit for the transformation matrix, rather than building up a bunch of rotation and shear matrices. Numpy has a function that will do this.
def add_bias_term(matrix):
    """Prepend a column of ones so a least-squares fit can learn an offset."""
    return np.append(np.ones((matrix.shape[0], 1)), matrix, axis=1)


# Least-squares affine map from gantry 2 coordinates to gantry 1 coordinates.
x, _, _, _ = np.linalg.lstsq(add_bias_term(gantry_2_coords), gantry_1_coords, rcond=None)
final_gantry_2_coords = add_bias_term(gantry_2_coords) @ x
This is both a heck of a lot shorter, and produces a better fit to boot:
And here is the matrix that it finds:
array([[ 0.19213806, -0.37107611],
[ 1.00028902, 0.00123954],
[-0.00359818, 1.00014869]])
(Note that the first row is the bias term.)
Although, the fit is not perfect. Here are the residuals:
array([[-0.06704727, -0.10997465], # point 0
[ 0.06716097, 0.11016114], # point 1
[-0.06720015, -0.1102254 ], # point 2
[ 0.06708645, 0.11003891]]) # point 3
Unfortunately, this remaining error is nonlinear, by definition. (If there were an affine matrix which reduced the error better, lstsq would have found it.)
Adding nonlinearity
Eyeballing the residuals, the error goes in one direction when both x and y are large, and in the other direction when only one of x or y are large. That suggests to me that you need an interaction term. In other words, you need to preprocess the input matrix so that it has a column with X, a column with Y, and a column with X*Y.
The code to do that looks like this:
def add_bias_term(matrix):
    """Prepend a column of ones so a least-squares fit can learn an offset."""
    return np.append(np.ones((matrix.shape[0], 1)), matrix, axis=1)


def add_interaction(matrix):
    """Prepend a column holding the product of the first two columns (X*Y)."""
    inter = (matrix[:, 0] * matrix[:, 1]).reshape(matrix.shape[0], 1)
    return np.append(inter, matrix, axis=1)


# Same fit as before, now with columns [1, X*Y, X, Y] as inputs.
x, _, _, _ = np.linalg.lstsq(add_bias_term(add_interaction(gantry_2_coords)), gantry_1_coords, rcond=None)
final_gantry_2_coords = add_bias_term(add_interaction(gantry_2_coords)) @ x
And the graph for that looks like this:
And that's close enough that the two graphs are right on top of each other.

Random erasing at multiple random regions of image

I want to implement a custom random erasing function.
This function would take an input image and a percentage to mask, but would then mask between 1 and 4 random rectangles whose total area adds up to the mask percentage.
For example, say my image is 100×100 pixels and my mask percent is 15%, so I randomly choose to create 3 rectangles with random shapes such that their combined area sums up to 100*100*0.15 pixels.
So far I have managed to write the code that decides on the number of rectangles and their widths and heights, but I am struggling with the part that makes sure they don't mask the same spot.
img_c, img_h, img_w = img.shape[-3], img.shape[-2], img.shape[-1]
area = img_h * img_w
for _ in range(10):
block_num = torch.randint(1,4,(1,)).item()
block_sizes = torch.rand((block_num))
block_sizes = torch.round((block_sizes / block_sizes.sum()) * (area * mask_percent))
h = torch.round((torch.rand(block_num)+0.5) * block_sizes.sqrt())
w = torch.round(block_sizes / h)
xs = []
ys = []
if not (any(h < img_h) and any(w < img_w)):
term = True
while term:
xs = [torch.randint(0, img_h - h_ + 1, size=(1, )).item() for h_ in h]
ys = [torch.randint(0, img_w - w_ + 1, size=(1, )).item() for w_ in w]
for iter,x in enumerate(xs):
if (x+h[iter]-xs)<0
#here i get all confused. should have a loop that goes over each point and checks that the location + axial size
#doesn't go over another point. it's confusing because should i also take care vice versa? maybe someone knows of a ready made solution?
return i, j, h, w, v
# Return original image
return 0, 0, img_h, img_w, img
The while loop exits once the random location generator produces locations that satisfy these conditions.
My latest attempt seems to work, but it always exits the loop unsolved! Is it just not a very likely set of parameters?
img = torch.rand(1,160,1024)
img_c, img_h, img_w = img.shape[-3], img.shape[-2], img.shape[-1]
area = img_h * img_w
for _ in range(100):
block_num = torch.randint(1,3,(1,)).item()
block_sizes = torch.rand((block_num))
block_sizes = torch.round((block_sizes / block_sizes.sum()) * (area * 0.15))
h = torch.round((torch.rand(block_num)+0.5) * block_sizes.sqrt()).to(dtype=torch.long)
w = torch.round(block_sizes / h).to(dtype=torch.long)
if (h > img_h).any() or (w > img_w).any():
overlap1 = torch.zeros(img.shape)
xs = [torch.randint(0, img_h - h_.item() + 1, size=(1, )).item() for h_ in h]
ys = [torch.randint(0, img_w - w_.item() + 1, size=(1, )).item() for w_ in w]
for iter,(x,y) in enumerate(zip(xs,ys)):
overlap1[0,x:x+h[iter],y:y+w[iter]] += 1
if (overlap1>1).any():
When you are checking that rectangle2 doesn't overlap with rectangle1, just check their intersection area. If intersection area is greater than 0, reject rectangle2 and check next random rectangle. Repeat the process with new rectangle.
For checking intersection - use sklearn's Jaccard score (avoid reinventing the wheel). To be able to use Jaccard for comparison, the two arrays (images) should be of same size. So generate original image size equivalent masks (mask1 and mask2) from your rectangle1 and rectangle2 respectively and then calculate Jaccard of mask1 and mask2.
import numpy as np
from sklearn.metrics import jaccard_score
mask1 = np.array([[0, 1, 1],
                  [1, 1, 0]])
mask2 = np.array([[1, 1, 1],
                  [1, 0, 0]])
# 2-D masks are treated as multilabel targets, which the default
# average='binary' rejects. average='micro' pools every pixel and gives
# intersection / union over the whole mask (3 / 5 = 0.6 here).
jaccard_score(mask1, mask2, average="micro")

Deprojecting depth onto original mesh

I am trying to get Blender to render a depth map of an object and then move it so that it overlays the original object. Currently I have no issue with rendering the object and extracting it into its place.
However, I am stuck when trying to move the object into its original position.
I'm trying to apply the inverse camera world matrix to the rendered point cloud (in blue). Unfortunately, when I apply that inverse, the result appears nowhere near where I'd expect it (in red).
I have attached the entirety of code that I have to replicate this behaviour. I would appreciate it if someone would point me to the right matrix that I should be multiplying the point cloud by.
from mathutils import Vector, Quaternion, Euler, Matrix
import numpy as np
import bpy
def main_script():
tmp_path = "/tmp/tmp_render.exr"
scene = get_scene("Scene")
camera = create_camera("Camera")
camera.rotation_euler = Euler([np.pi * 0.5, 0, np.pi * 0.5], "XYZ")
camera.location = Vector([4.5, 0, 1])
location=(0, 0, 1), rotation=(0, 0, np.pi*0.5), size=1.0)
_w, _h = 640, 480
init_rendering(scene, camera, width=640, height=480)
matrix_K = get_calibration_matrix_K_from_blender(scene,
_fy, _fx = matrix_K[0][0], matrix_K[1][1]
_cy, _cx = matrix_K[0][2], matrix_K[1][2]
scene.render.filepath = tmp_path
depth = read_exr(tmp_path, "R")["R"]
depth = np.reshape(convert_to_numpy(depth), [_h, _w])
exr_cloud = depth_to_cloud(
_w, _h, _fx, _fy, _cx, _cy, depth)
exr_cloud = np.reshape(exr_cloud, [-1, 3])
exr_cloud = exr_cloud[(exr_cloud[..., 2] < 100) & (exr_cloud[..., 2] > 0)]
matrix = np.reshape(camera.matrix_world, [4, 4])
matrix = np.linalg.inv(matrix) # why doesn't this place the depth properly
vertices = np.ones([exr_cloud.shape[0], 4], dtype=np.float32)
vertices[:, 0:3] = exr_cloud
vertices = np.array(
[matrix # vertex for vertex in vertices], dtype=np.float32)
vertices = vertices[..., :3]
create_mesh("Suzanne_EXR", exr_cloud, [])
create_mesh("SuzanneT_EXR", vertices, [])
utilities methods required to run the script
def clear_scene():
for scene in
for obj in scene.objects:
def read_exr(path, channels):
    """Read the requested channels of an OpenEXR file as 32-bit float data.

    Args:
        path: path of the EXR file.
        channels: iterable of channel names; a string is iterated character
            by character, so ``"R"`` reads the single channel ``R``.

    Returns:
        Dict mapping each channel name to whatever ``InputFile.channel``
        returns for it.
    """
    import OpenEXR as _OpenEXR
    import Imath as _Imath
    file = _OpenEXR.InputFile(path)
    try:
        FLOAT = _Imath.PixelType(_Imath.PixelType.FLOAT)
        results = {}
        for ch in channels:
            results[ch] = file.channel(ch, FLOAT)
        return results
    finally:
        # Release the file handle even when a channel is missing.
        file.close()
def convert_to_numpy(data):
    """Interpret ``data`` as C floats and return them as a NumPy array.

    Args:
        data: anything ``array.array("f", ...)`` accepts, e.g. a bytes object
            of packed native floats or an iterable of numbers.

    Returns:
        1-D NumPy array built from the decoded Python floats.
    """
    import array as _array
    return np.array(_array.array("f", data).tolist())
def update_scene():
dg = bpy.context.evaluated_depsgraph_get()
def prepare_views():
preferences = bpy.context.preferences
preferences.view.show_tooltips_python = True
preferences.view.show_developer_ui = True
preferences.view.render_display_type = "NONE"
def init_rendering(scene, camera, width=None, height=None):
def set_rendering_settings(camera, scene, width=640, height=480):
image_settings = scene.render.image_settings
image_settings.file_format = "OPEN_EXR"
image_settings.use_zbuffer = True
scene.render.resolution_x = width
scene.render.resolution_y = height
# scene.render.use_antialiasing = False
scene.use_nodes = True = camera
node_tree = scene.node_tree
nodes = node_tree.nodes
node_render_layers = nodes["Render Layers"]
node_composite = nodes["Composite"]
node_render_layers.outputs["Depth"], node_composite.inputs["Image"])
set_rendering_settings(camera, scene)
def get_scene(name): return[name]
def create_camera(name):
camera =
camera.lens = 50
obj =, camera)
return obj
# ---------------------------------------------------------------
# 3x4 P matrix from Blender camera
# ---------------------------------------------------------------
# Build intrinsic camera parameters from Blender camera data
# See notes on this in
def get_calibration_matrix_K_from_blender(scene, camera):
    """Build the 3 x 3 intrinsic calibration matrix K.

    Args:
        scene: object exposing ``render.resolution_x``,
            ``render.resolution_y``, ``render.resolution_percentage``,
            ``render.pixel_aspect_x`` and ``render.pixel_aspect_y``.
        camera: object exposing ``lens``, ``sensor_width``,
            ``sensor_height`` and ``sensor_fit``.

    Returns:
        ``mathutils.Matrix`` ``[[alpha_u, 0, u_0], [0, alpha_v, v_0], [0, 0, 1]]``.
    """
    from mathutils import Matrix as _Matrix
    render = scene.render
    f_in_mm = camera.lens
    resolution_x_in_px = render.resolution_x
    resolution_y_in_px = render.resolution_y
    scale = render.resolution_percentage / 100
    sensor_width_in_mm = camera.sensor_width
    sensor_height_in_mm = camera.sensor_height
    pixel_aspect_ratio = render.pixel_aspect_x / render.pixel_aspect_y
    if camera.sensor_fit == 'VERTICAL':
        # the sensor height is fixed (sensor fit is vertical),
        # the sensor width is effectively changed with the pixel aspect ratio
        s_u = resolution_x_in_px * scale / sensor_width_in_mm / pixel_aspect_ratio
        s_v = resolution_y_in_px * scale / sensor_height_in_mm
    else:  # 'HORIZONTAL' and 'AUTO'
        # the sensor width is fixed (sensor fit is horizontal),
        # the sensor height is effectively changed with the pixel aspect ratio
        s_u = resolution_x_in_px * scale / sensor_width_in_mm
        s_v = resolution_y_in_px * scale * pixel_aspect_ratio / sensor_height_in_mm
    # Parameters of intrinsic calibration matrix K
    alpha_u = f_in_mm * s_u
    alpha_v = f_in_mm * s_v
    u_0 = resolution_x_in_px * scale / 2
    v_0 = resolution_y_in_px * scale / 2
    skew = 0  # only use rectangular pixels
    K = _Matrix(
        ((alpha_u, skew, u_0),
         (0, alpha_v, v_0),
         (0, 0, 1)))
    return K
def create_mesh(name, vertices, faces):
import bmesh as _bmesh
mesh ="Mesh_%s" % name)
mesh.from_pydata(vertices, [], faces)
obj =, mesh)
bm =
return obj
def depth_to_cloud(w, h, fx, fy, cx, cy, depth):
    """Back-project a depth image into a grid of 3-D points.

    Each pixel (row ``v``, column ``u``) with depth ``d`` becomes
    ``((u - cx) * d / fx, (v - cy) * d / fy, d)``.

    Args:
        w, h: unused; the grid size is taken from ``depth.shape``.
        fx, fy: focal lengths in pixels along x (columns) and y (rows).
        cx, cy: principal point in pixels along x and y.
        depth: 2-D array of depth values indexed ``[row, column]``.

    Returns:
        Array of shape ``depth.shape + (3,)`` holding ``(x, y, z)`` per pixel.
    """
    indices_y, indices_x = np.indices(depth.shape)
    xs = (indices_x - cx) * depth / fx
    ys = (indices_y - cy) * depth / fy
    zs = depth
    return np.stack([xs, ys, zs], axis=2)
if __name__ == "__main__":
    # Call the entry point directly; the previous `raise main_script()`
    # raises a TypeError once the function returns None.
    main_script()
The problem was compound. First, I needed to change how the transformed vertices are calculated: instead of the inverse camera world matrix, use the camera world matrix multiplied by a negative scale, like so:
# Camera-to-world transform combined with a uniform scale of -1.
matrix_cam = np.reshape(camera.matrix_world, [4, 4])
mat_scale = np.array(Matrix.Scale(-1, 4))
matrix = matrix_cam @ mat_scale
# Homogeneous copies of the points: (x, y, z, 1).
vertices = np.ones([exr_cloud.shape[0], 4], dtype=np.float32)
vertices[:, 0:3] = exr_cloud
vertices = np.array(
    [matrix @ vertex for vertex in vertices], dtype=np.float32)
vertices = vertices[..., :3]
Additionally, there was an issue with depth decoding which caused the point cloud to be deformed, fixed like so
ys, xs, zs = \
(indices_y - cx) * depth / fx, \
(indices_x - cy) * depth / fy, \

Implementing a bilateral filter

I am trying to implement a bilateral filter from the paper "Fast Bilateral Filtering for the Display of High-Dynamic-Range Images". The equation (from the paper) that implements the bilateral filter is given as:
According to what I understood,
f is a Gaussian filter
g is a Gaussian filter
p is a pixel in a given image window
s is the current pixel
Ip is the intensity at the current pixel
With this, I wrote the code to implement these equations, given as :
import cv2
import numpy as np
img = cv2.imread("fish.png")
# image of width 239 and height 200
bl_img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
i = cv2.magnitude(
cv2.Sobel(bl_img, cv2.CV_64F, 1, 0, ksize=3),
cv2.Sobel(bl_img, cv2.CV_64F, 0, 1, ksize=3)
f = cv2.getGaussianKernel(5, 0.1, cv2.CV_64F)
g = cv2.getGaussianKernel(5, 0.1, cv2.CV_64F)
rows, cols, _ = img.shape
filtered = np.zeros(img.shape, dtype=img.dtype)
for r in range(rows):
for c in range(cols):
ks = []
for index in [-2,-1,1,2]:
if index + c > 0 and index + c < cols-1:
p = img[r][index + c]
s = img[r][c]
i_p = i[index+c]
i_s = i[c]
(f * (p-s)) * (g * (i_p * i_s)) # EQUATION 7
ks = np.sum(np.array(ks))
js = []
for index in [-2, -1, 1, 2]:
if index + c > 0 and index + c < cols -1:
p = img[r][index + c]
s = img[r][c]
i_p = i[index+c]
i_s = i[c]
js.append((f * (p-s)) * (g * (i_p * i_s)) * i_p) # EQUATION 6
js = np.sum(np.asarray(js))
js = js / ks
filtered[r][c] = js
cv2.imwrite("f.png", filtered)
But as I run this code I get an error saying:
Traceback (most recent call last):
File "", line 33, in <module>
(f * (p-s)) * (g * (i_p * i_s))
ValueError: operands could not be broadcast together with shapes (5,3) (5,239)
Did I incorrectly implement the equations? What am I missing?
There are various issues with your code. Foremost, the equation is interpreted in a wrong way. f(p-s) means evaluating the function f at p-s. f is the Gaussian. Likewise with g. The section of the code would look like this:
weight = gaussian(p - s, sigma_f) * gaussian(i_p - i_s, sigma_g)
js.append(weight * i_p)
Note that the two loops can be merged, this way you avoid some duplicated computation. gaussian(x, sigma) would be a function that computes the Gaussian weight at x. You need to define two sigmas, sigma_f and sigma_g, the spatial and the tonal sigma respectively.
The second issue is in the definition of p and s. These are the coordinates of the pixel, not the value of the image at the pixel. i_p and i_s are the value of the image at those locations. p-s is basically the spatial distance between the pixel at (r,c) and the given neighbor.
The third issue is the loop over the neighborhood. The neighborhood is all pixels where gaussian(p - s, sigma_f) is not negligible. So how large the neighborhood is depends on the chosen sigma_f. You should take it at least to be ceil(2*sigma_f). Say sigma_f is 2, then you want the neighborhood to go from -4 to 4 (9 pixels). But this neighborhood is two dimensional, not one-dimensional as in your code. So you need two loops:
for ii in range(-ceil(2*sigma_f), ceil(2*sigma_f)+1):
if ii + c > 0 and ii + c < cols-1:
for jj in range(-ceil(2*sigma_f), ceil(2*sigma_f)+1):
if jj + r > 0 and jj + r < rows-1:
# compute weight here
Note that now, p-s is computed with math.sqrt(ii**2 + jj**2). But also note that the Gaussian uses x**2, so you could skip the computation of the square root by passing x**2 into your gaussian function.

Tensorflow - Get Neighborhood of Pixel

I'm trying to implement the loss function of the classic Image Colorization paper by Levin et al (2004) in Tensorflow/Keras:
This is the weights equation (correlation between intensities):
y is every neighboring pixel of x in a 3x3 window and w is the weight for each of these pixels.
The weights require computing the mean and variance for the neighborhood of every pixel.
I couldn't find a function that would allow me to write this loss function in a symbolic way, and I'm thinking I should write it in a loop where I calculate the w for each window.
How can I write this Loss function in Tensorflow In a Symbolic way or in loops?
Thanks so much.
EDIT: Here's the code I've come up for calculating the weights in Numpy:
import cv2
import numpy as np
im = cv2.resize(cv2.imread('./Image.jpg', 0), (256, 256)) / np.float32(255.0)
M = 3
N = 3
# Split the image into 3x3 windows
windows = [im[x:x + M, y:y + N] for x in range(0, im.shape[0], M) for y in range(0, im.shape[1], N)]
# Calculate the correlation for each window
weights = [1 + np.corrcoef(tile) for tile in windows]
I think this code computes the value in your formula:
import tensorflow as tf
from itertools import product
SIGMA = 1.0
dtype = tf.float32
# Input images batch, indexed [batch, height, width]
img = tf.placeholder(dtype, [None, None, None])
img_shape = tf.shape(img)
img_height = img_shape[1]
img_width = img_shape[2]
# Compute 3 x 3 block means
mean_filter = tf.ones((3, 3), dtype) / 9
img_mean = tf.nn.conv2d(img[:, :, :, tf.newaxis],
                        mean_filter[:, :, tf.newaxis, tf.newaxis],
                        [1, 1, 1, 1], 'VALID')[:, :, :, 0]
# Remove 1px border
img_clip = img[:, 1:-1, 1:-1]
# Difference between pixel intensity and its block mean
x_diff = img_clip - img_mean
# Compute neighboring pixel loss contributions
contributions = []
for i, j in product((-1, 0, 1), repeat=2):
    if i == j == 0:
        continue
    # Take "shifted" image. Axis 1 is height and axis 2 is width, so the
    # row slice is bounded by img_height and the column slice by img_width.
    displaced_img = img[:, 1 + i:img_height - 1 + i, 1 + j:img_width - 1 + j]
    # Compute difference with mean of corresponding pixel block
    y_diff = displaced_img - img_mean
    # Weights formula
    weight = 1 + x_diff * y_diff / (SIGMA ** 2)
    # Contribution of this displaced image to the loss of each pixel
    contribution = weight * displaced_img
    contributions.append(contribution)
# Sum the eight neighbor contributions per pixel
contributions = tf.add_n(contributions)
# Compute loss value
loss = tf.reduce_sum(tf.squared_difference(img_clip, contributions))
The loss for the pixels along the image border is not computed, since in principle is not well defined in the formula, although you could make a few changes to take them into account if you want (change convolution to "'SAME'", pad where necessary, etc.).
this is a mean squared error of a 3 x 3 windows. right?
sounds like a GLCM matrix for texture analysis do you want apply this loss function for every 3x3 windows in the image?
I think that is better build the function that make this calculation with a Random weight in Numpy so after try build with TF to try a optimization.
