Fitting an Orthogonal Grid to Noisy Coordinates - python

Problem
I have a list of coordinates that are meant to form a grid. Each coordinate has a random error component, and some of the coordinates are missing. Update: the grid may also be rotated. I want to fit an orthogonal grid to the data points and return a list of the grid's vertices.
Application
The purpose is to find a grid in a scanned image. The data points come from the results of contour or edge detection in OpenCV. An example is an image with a grid of photos.
Goal
I wrote some Python code that works, but I would like to find a linear algebra algorithm using SciPy, statsmodels, or other modules that would be more robust and could handle a small rotation of the grid (less than 10°).
Python Code Using Lists Only
# Noisy [x, y] coordinates (origin is upper-left corner)
pts = [[103, 101],
       [198, 103],
       [300,  99],
       [ 97, 205],
       [304, 202],
       [102, 295],
       [200, 303],
       [104, 405],
       [205, 394],
       [298, 401]]
def row_col_avgs(num_list, ratio):
    # Finds the average of each row and column. Coordinates are
    # assigned to a row and column by specifying an error ratio.
    last_num, sum_nums, count_nums, avgs = 0, 0, 0, []
    num_list.sort()
    for num in num_list:
        # Calculate average for last row or column and begin new row or column
        if num > (1 + ratio) * last_num and count_nums != 0:
            avgs.append(int(round(sum_nums / count_nums, 0)))
            sum_nums = num
            count_nums = 1
        # Or continue with current row or column
        else:
            sum_nums += num
            count_nums += 1
        last_num = num
    avgs.append(int(round(sum_nums / count_nums, 0)))
    return avgs
# Split coordinates into two lists of x's and y's
xs, ys = map(list, zip(*pts))
# Find averages of each row and column of the grid
x_avgs = row_col_avgs(xs, 0.1)
y_avgs = row_col_avgs(ys, 0.1)
# Return vertices of completed averaged grid
avg_grid = []
for y_avg in y_avgs:
    avg_row = []
    for x_avg in x_avgs:
        avg_row.append([int(x_avg), int(y_avg)])
    avg_grid.append(avg_row)
print(avg_grid)
Output
[[[102, 101], [201, 101], [301, 101]],
[[102, 204], [201, 204], [301, 204]],
[[102, 299], [201, 299], [301, 299]],
[[102, 400], [201, 400], [301, 400]]]

Parallel Slopes Ordinary Least Squares (OLS) Model:
y = m·x + grp + b, where m is the slope, b is the y-intercept, and grp is a categorical variable.
This is an alternative algorithm that can handle a rotated grid.
The OLS model includes both the data points in the original orientation and a 90° rotation of the same data points. This is necessary so that all gridlines are parallel and have the same slope.
Algorithm:
1. Find a reference gridline to compare with the remaining points by choosing two neighboring points in the first or last row with a slope closest to zero.
2. Calculate the distances between this reference line and the remaining points.
3. Segment the points into groups w.r.t. the calculated distances (one group per gridline).
4. Repeat steps 1 to 3 for the 90° rotated grid and combine the results.
5. Create a parallel slopes OLS model to determine the linear equations of the gridlines.
6. Rotate the rotated gridlines back to their original orientation.
7. Calculate the intersection points.
Note: this fails if the noise, rotation angle, and/or fraction of missing data are too large.
Example: (plot of the fitted gridlines over the noisy points, generated by the code below)
Python Code to Create Example
def create_random_example():
    # Requires numpy (np) and random to be imported
    # Creates a grid with random noise and missing points
    # Example will fail if sd, rotation, or pct_remove is too large

    # Parameters
    first_row, last_row = 100, 900
    first_col, last_col = 100, 600
    num_rows = 6
    num_cols = 4
    rotation = 3     # degrees that grid is rotated
    sd = 3           # percent std dev of avg x and avg y coordinates
    pct_remove = 30  # percent of points to randomly remove from data

    # Create grid
    x = np.linspace(first_col, last_col, num_cols)
    y = np.linspace(first_row, last_row, num_rows)
    xx, yy = np.meshgrid(x, y)

    # Add noise
    x = xx.flatten() + sd * np.mean(xx) * np.random.randn(xx.size) / 100
    y = yy.flatten() + sd * np.mean(yy) * np.random.randn(yy.size) / 100

    # Randomly remove points
    random_list = random.sample(range(0, num_cols * num_rows),
                                int(pct_remove * num_cols * num_rows / 100))
    x, y = np.delete(x, random_list), np.delete(y, random_list)
    pts = np.column_stack((x, y))

    # Rotate points
    radians = np.radians(rotation)
    rot_mat = np.array([[np.cos(radians), -np.sin(radians)],
                        [np.sin(radians),  np.cos(radians)]])
    einsum = np.einsum('ji, mni -> jmn', rot_mat, [pts])
    pts = np.squeeze(einsum).T
    return np.rint(pts)
Python Code to Fit Gridlines
import numpy as np
import pandas as pd
import itertools
import math
import random
from statsmodels.formula.api import ols
from scipy.spatial import KDTree
import matplotlib.pyplot as plt
def pt_line_dist(pt, ref_line):
    # Distance from point to line defined by two other points
    pt1, pt2 = ref_line[:2], ref_line[2:]
    return np.linalg.norm(np.cross(pt1 - pt2, [pt[0], pt[1]])) \
           / np.linalg.norm(pt1 - pt2)
def segment_pts(amts, grp_var, grp_label):
    # Segment on amounts (distances here) in the last column of the array
    # Note: groups must be labeled with strings for the OLS model
    amts = amts[amts[:, -1].argsort()]
    first_amt_in_grp = amts[0][-1]
    group, groups, grp = [], [], 0
    for amt in amts:
        if amt[-1] - first_amt_in_grp > grp_var:
            groups.append(group)
            first_amt_in_grp = amt[-1]
            group = []
            grp += 1
        group.append(np.append(amt[:-1], [[grp_label + str(grp)]]))
    groups.append(group)
    return groups
def find_reference_line(pts):
    # Find the point with minimum absolute slope relative to both min y and max y
    y = np.hsplit(pts, 2)[1]  # y column of array
    m = []
    for i, y_pt in enumerate([pts[np.argmin(y)], pts[np.argmax(y)]]):
        m.append(np.zeros((pts.shape[0] - 1, 5)))  # dtype default is float64
        m[i][:, 2:4] = np.delete(pts, np.where((pts == y_pt).all(axis=1))[0], axis=0)
        m[i][:, 4] = abs((m[i][:, 3] - y_pt[1]) / (m[i][:, 2] - y_pt[0]))
        m[i][:, :2] = y_pt
    m = np.vstack((m[0], m[1]))
    return m[np.argmin(m[:, 4]), :4]
# Ignore division by zero (slopes of vertical lines)
np.seterr(divide='ignore')
# Create dataset and plot
pts = create_random_example()
plt.scatter(pts[:,0], pts[:,1], c='r') # plot now because pts array changes
# Average distance to the nearest neighbor of each point
tree = KDTree(pts)
nn_avg_dist = np.mean(tree.query(pts, 2)[0][:, 1])
# Find groups of points representing each gridline
groups = []
for orientation in ['o', 'r']:  # original and rotated orientations
    # Rotate points 90 degrees (note: this moves pts to the 2nd quadrant)
    if orientation == 'r':
        pts[:, 1] = -1 * pts[:, 1]
        pts[:, [1, 0]] = pts[:, [0, 1]]
    # Find reference line to compare remaining points against for grouping
    ref_line = find_reference_line(pts)  # line is defined by two points
    # Distances between points and the reference line
    pt_dists = np.zeros((pts.shape[0], 3))
    pt_dists[:, :2] = pts
    pt_dists[:, 2] = np.apply_along_axis(pt_line_dist, 1, pts, ref_line).T
    # Segment pts into groups w.r.t. distances (one group per gridline)
    # Groups have range less than nn_avg_dist.
    groups += segment_pts(pt_dists, 0.7 * nn_avg_dist, orientation)
# Create dataframe of groups (OLS model requires a dataframe)
df = pd.DataFrame(np.row_stack(groups), columns=['x', 'y', 'grp'])
df['x'] = pd.to_numeric(df['x'])
df['y'] = pd.to_numeric(df['y'])
# Parallel slopes OLS model
ols_model = ols("y ~ x + grp + 0", data=df).fit()
# OLS parameters
grid_lines = ols_model.params[:-1].to_frame() # pandas Series to DataFrame
grid_lines = grid_lines.rename(columns = {0:'b'})
grid_lines['grp'] = grid_lines.index.str[4:6]
grid_lines['m'] = ols_model.params[-1] # slope
# Rotate the rotated lines back to their original orientation
grid_lines.loc[grid_lines['grp'].str[0] == 'r', 'b'] = grid_lines['b'] / grid_lines['m']
grid_lines.loc[grid_lines['grp'].str[0] == 'r', 'm'] = -1 / grid_lines['m']
# Find grid intersection points by combinations of gridlines
comb = list(itertools.combinations(grid_lines['grp'], 2))
comb = [i for i in comb if i[0][0] != 'r']
comb = [i for i in comb if i[1][0] != 'o']
df_comb = pd.DataFrame(comb, columns=['grp', 'r_grp'])
# Merge gridline parameters with grid points
grid_pts = df_comb.merge(grid_lines.drop_duplicates('grp'),how='left',on='grp')
grid_lines.rename(columns={'grp': 'r_grp'}, inplace=True)
grid_pts.rename(columns={'b':'o_b', 'm': 'o_m', 'grp':'o_grp'}, inplace=True)
grid_pts = grid_pts.merge(grid_lines.drop_duplicates('r_grp'),how='left',on='r_grp')
grid_pts.rename(columns={'b':'r_b', 'm': 'r_m'}, inplace=True)
# Calculate x, y coordinates of gridline interception points
grid_pts['x'] = (grid_pts['r_b']-grid_pts['o_b']) \
/ (grid_pts['o_m']-grid_pts['r_m'])
grid_pts['y'] = grid_pts['o_m'] * grid_pts['x'] + grid_pts['o_b']
# Results output
print(grid_lines)
print(grid_pts)
plt.scatter(grid_pts['x'], grid_pts['y'], s=8, c='b') # for setting axes
axes = plt.gca()
axes.invert_yaxis()
axes.xaxis.tick_top()
axes.set_aspect('equal')
axes.set_xlim(axes.get_xlim())
axes.set_ylim(axes.get_ylim())
x_vals = np.array(axes.get_xlim())
for idx in grid_lines.index:
    y_vals = grid_lines['b'][idx] + grid_lines['m'][idx] * x_vals
    plt.plot(x_vals, y_vals, c='gray')
plt.show()

A NumPy implementation of your code can be found below. As the size of AvgGrid is known, I pre-allocate the required memory (rather than appending). This should have speed advantages, especially if the number of output vertices is large.
import numpy as np
# Input of [x, y] coordinates of a sparse grid with errors
xys = np.array([[103, 101],
                [198, 103],
                [300,  99],
                [ 97, 205],
                [304, 202],
                [102, 295],
                [200, 303],
                [104, 405],
                [205, 394],
                [298, 401]])
# Function to average
def ColAvgs(CoordinateList, CutoffRatio=1.1):
    # Length of CoordinateList
    L = len(CoordinateList)
    # Sort input
    SortedList = np.sort(CoordinateList)
    # Determine indices to average
    RelativeIncrease = SortedList[-(L-1):] / SortedList[:(L-1)]
    CriticalIndices = np.flatnonzero(RelativeIncrease > CutoffRatio) + 1
    Indices = np.hstack((0, CriticalIndices))
    if Indices[-1] != L:
        Indices = np.hstack((Indices, L))
    # print(Indices)  # Uncomment to show index construction
    # Compute averages
    Avgs = np.empty(len(Indices) - 1)
    Avgs[:] = np.nan
    for i in range(len(Avgs)):
        Avgs[i] = int(round(np.mean(SortedList[Indices[i]:Indices[i+1]])))
    # Return output
    return Avgs
# Compute x- and y-coordinates of vertices
AvgsXcoord = ColAvgs(xys[:,0])
AvgsYcoord = ColAvgs(xys[:,1])
# Return all vertices
AvgGrid = np.empty((len(AvgsXcoord) * len(AvgsYcoord), 2))
AvgGrid[:] = np.nan
i = 0
for y in AvgsYcoord:
    for x in AvgsXcoord:
        AvgGrid[i, :] = np.hstack((x, y))
        i = i + 1
print(AvgGrid)

If you project all points onto a vertical or horizontal axis, the problem turns into one of clustering with equally spaced clusters.
To perform these clusterings, you can consider the distances between successive (sorted) points. They will form two clusters: short distances corresponding to noise, and longer ones for the grid spacing. You can solve this two-class clustering problem with Otsu's method.
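A minimal sketch of that idea, with a hand-rolled Otsu threshold so no extra dependency is needed (the helper names are mine, not part of the original answer):

import numpy as np

def otsu_threshold(vals, nbins=64):
    # Otsu's method on a 1-D sample: pick the histogram split that
    # maximizes the between-class variance.
    hist, edges = np.histogram(vals, bins=nbins)
    mids = 0.5 * (edges[:-1] + edges[1:])
    w = hist.astype(float)
    best_t, best_var = mids[0], -1.0
    for k in range(1, nbins):
        w0, w1 = w[:k].sum(), w[k:].sum()
        if w0 == 0 or w1 == 0:
            continue
        m0 = (w[:k] * mids[:k]).sum() / w0
        m1 = (w[k:] * mids[k:]).sum() / w1
        var_between = w0 * w1 * (m0 - m1) ** 2
        if var_between > best_var:
            best_var, best_t = var_between, mids[k - 1]
    return best_t

def gridline_positions(projected):
    # Cluster the sorted 1-D projections: gaps above the Otsu threshold
    # start a new gridline, gaps below it are noise within a gridline.
    s = np.sort(np.asarray(projected, dtype=float))
    gaps = np.diff(s)
    cut = otsu_threshold(gaps)
    starts = np.r_[0, np.flatnonzero(gaps > cut) + 1]
    ends = np.r_[starts[1:], s.size]
    return [s[a:b].mean() for a, b in zip(starts, ends)]

# e.g. with the x coordinates from the question's example:
# gridline_positions([103, 198, 300, 97, 304, 102, 200, 104, 205, 298])
# -> approximately [101.5, 201.0, 300.7]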

Related

Voronoi diagram using python to make exact hexagons

I am trying to make a hexagonal fill by the Voronoi diagram. One problem I find is that, although the plot it produces is a hexagon diagram, the distances between the points vary.
The first function gives a Voronoi diagram of exact hexagons. Then I am trying to assign a universal initial distance between cells as the spring rest length between them.
Now my problem is that the initial hexagonal diagram gives non-uniform distances between cells. We can see this from the result printed by the line "print(a)" in the code. However, I assigned the coordinates of the points by 'x = (col + (0.5 * (row % 2))) * np.sqrt(3)' and 'y = row * 0.5', which should give exact hexagons. I don't understand how I am getting different distances between points.
The following is my code; the second function part is mostly about finding the neighbors of each cell and computing the distances between each cell and its neighbors. I am printing the distances with the 'print(a)' line.
import numpy as np
import freud
import matplotlib.pyplot as plt
from scipy.spatial import Delaunay
from collections import defaultdict
import itertools
# Source: https://freud.readthedocs.io/en/v2.10.0/gettingstarted/examples/module_intros/locality.Voronoi.html
def hexagonal_lattice(rows=3, cols=3, noise=.0, seed=None):
    if seed is not None:
        np.random.seed(seed)
    # Assemble a hexagonal lattice
    points = []
    for row in range(rows * 2):
        for col in range(cols):
            x = (col + (0.5 * (row % 2))) * np.sqrt(3)
            y = row * 0.5  # These x, y are allocated to produce exact hexagons
            points.append((x, y, 0))
    points = np.asarray(points)
    points += np.random.multivariate_normal(
        mean=np.zeros(3), cov=np.eye(3) * noise, size=points.shape[0]
    )
    # Set z=0 again for all points after adding Gaussian noise
    # points[:, 2] = 0  # do not see the need; the wrap below seems to set the z coordinate to 0
    # Wrap the points into the box
    box = freud.box.Box(Lx=cols * np.sqrt(3), Ly=rows, is2D=True)
    points = box.wrap(points)  # wrap the points into the given box using periodic boundaries
    return box, points
# Compute the Voronoi diagram and plot
box1, pts1 = hexagonal_lattice(rows=12, cols=12, seed=2) # Noise = 0
voro = freud.locality.Voronoi()
voro.compute((box1, pts1))
plt.figure()
ax = plt.gca()
voro.plot(ax=ax, cmap="RdBu")
ax.scatter(pts1[:, 0], pts1[:, 1], s=2, c='k')
plt.show()
# This part is for the stability check of the initial exact hexagons diagram
def cell_movement(box, points, time_length, Lambda=0.01):
    time = 1
    while time <= time_length:
        # 2D projection + neighboring cells
        points_2d = []
        for point in points:
            points_2d.append([point[0], point[1]])  # projection to 2D for the neighbor list
        points_2d = np.asarray(points_2d)
        tri = Delaunay(points_2d)
        neiList = defaultdict(set)  # Neighbor list for each cell
        for p in tri.vertices:
            for i, j in itertools.combinations(p, 2):
                neiList[i].add(j)
                neiList[j].add(i)
        neiborList = sorted(neiList.items())  # Sorted neighbor array
        spring = np.ones((len(points[:, 0]), len(points[:, 0])))         # Initial spring rest lengths
        rintervec = np.empty((len(points[:, 0]), len(points[:, 0]), 2))  # Spring length array
        for i in range(len(neiborList)):
            for j in list(neiborList[i][1]):
                j = int(j)
                rintervec[i, j] = points_2d[i] - points_2d[j]  # Distance vector between cells i and j
                a = np.linalg.norm(rintervec[i, j])  # Distance between neighboring cells
                if a != 0:
                    print(a)  # These are the printed numbers
                    spring[i, j] = np.linalg.norm(rintervec[i, j])  # Assign a as the spring rest length
                    points_2d[i] += Lambda * rintervec[i, j] * (  # moves points by equation (8)
                        spring[i, j] - np.linalg.norm(rintervec[i, j])) / np.linalg.norm(rintervec[i, j])
            points[i] = np.append(points_2d[i], np.array([0]))
        # diagram
        points = box.wrap(points)  # wrap the points into the given box using periodic boundaries
        voro.compute((box, points))  # Computing the Voronoi diagram
        # figure
        plt.figure()
        ax = plt.gca()
        voro.plot(ax=ax, cmap="RdBu")
        ax.scatter(points[:, 0], points[:, 1], s=2, c='k')
        plt.savefig("C:\\doit\\pythonPractice\\At time %s.png" % time)  # saves the diagrams
        plt.show()
        time = time + 1

cell_movement(box1, pts1, time_length=5)

Generating random points in a box

I want to generate random points in a box (a=0.2 m, b=0.2 m, c=1 m). The points should be at random distances from each other, but the minimum distance between any two points should be 0.03 m; for this I used random.choice. When I run my code it generates random points, but the distance management is wrong. Also, my float-conversion approximation is terrible, because I don't want to change the random values I generated before, but I couldn't find any other solution. I'm open to suggestions.
import random
import matplotlib.pyplot as plt

# BOX a = 0.2m b = 0.2m h = 1m
save = 0  # for saving 3 different plots
for k in range(3):
    pointsX = []  # x coordinates of points
    pointsY = []  # y coordinates of points
    pointsZ = []  # z coordinates of points
    for i in range(100):  # number of points
        a = random.uniform(0.0, 0.00001)  # so the numbers generated below are floats
        x = random.choice(range(3, 21, 3))  # random coordinate for x
        x1 = x / 100 + a
        pointsX.append(x1)
        y = random.choice(range(3, 21, 3))  # random coordinate for y
        y1 = y / 100 + a
        pointsY.append(y1)
        z = random.choice(range(3, 98, 3))  # random coordinate for z
        z1 = z / 100 + a
        pointsZ.append(z1)
    new_pointsX = list(set(pointsX))  # deleting duplicates
    new_pointsY = list(set(pointsY))
    new_pointsZ = list(set(pointsZ))
    # check whether the max and min values are within the borders
    print("X-Min", min(new_pointsX))
    print("X-Max", max(new_pointsX))
    print("Y-Min", min(new_pointsY))
    print("Y-Max", max(new_pointsY))
    print("Z-Min", min(new_pointsZ))
    print("Z-Max", max(new_pointsZ))
    if max(new_pointsX) >= 0.2 or max(new_pointsY) >= 0.2:
        print("MAX VALUE GREATER THAN 0.2")
    if max(new_pointsZ) >= 0.97:
        print("MAX VALUE GREATER THAN 0.97")
    # 3D graph
    fig = plt.figure(figsize=(18, 9))
    ax = plt.axes(projection='3d')
    ax.set_xlim([0, 0.2])
    ax.set_ylim([0, 0.2])
    ax.set_zlim([0, 1])
    ax.set_title('title', fontsize=18)
    ax.set_xlabel('X', fontsize=14)
    ax.set_ylabel('Y', fontsize=14)
    ax.set_zlabel('Z', fontsize=14)
    ax.scatter3D(new_pointsX, new_pointsY, new_pointsZ)
    save += 1
    plt.savefig("graph" + str(save) + ".png", dpi=900)
As mentioned in the comments by @user3431635, you can check each point against all previous points before appending the new point to the list. I would do it something like this:
import random
import numpy as np
import matplotlib.pyplot as plt

plt.close("all")

a = 0.2   # x bound
b = 0.2   # y bound
c = 1.0   # z bound
N = 1000  # number of points

def distance(p, points, min_distance):
    """
    Determines if any points in the list are less than the minimum specified
    distance apart.

    Parameters
    ----------
    p : tuple
        `(x,y,z)` point.
    points : ndarray
        Array of points to check against. `x, y, z` points are columnwise.
    min_distance : float
        Minimum allowable distance between any two points.

    Returns
    -------
    bool
        True if point `p` is at least `min_distance` from all points in `points`.
    """
    distances = np.sqrt(np.sum((p - points)**2, axis=1))  # note: p - points, not p + points
    distances = np.where(distances < min_distance)
    return distances[0].size < 1

points = np.array([])  # x, y, z columnwise
while points.shape[0] < N:
    x = random.choice(np.linspace(0, a, 100000))
    y = random.choice(np.linspace(0, b, 100000))
    z = random.choice(np.linspace(0, c, 100000))
    p = (x, y, z)
    if len(points) == 0:  # add the first point blindly
        points = np.array([p])
    elif distance(p, points, 0.03):  # ensure the minimum distance is met
        points = np.vstack((points, p))

fig = plt.figure(figsize=(18, 9))
ax = plt.axes(projection='3d')
ax.set_xlim([0, a])
ax.set_ylim([0, b])
ax.set_zlim([0, c])
ax.set_title('title', fontsize=18)
ax.set_xlabel('X', fontsize=14)
ax.set_ylabel('Y', fontsize=14)
ax.set_zlabel('Z', fontsize=14)
ax.scatter(points[:, 0], points[:, 1], points[:, 2])
Note, this might not be the randomness you're looking for: I have taken the range of the x, y, and z values and split each into 100000 increments; a new x, y, or z coordinate is then chosen from those values.
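If truly continuous coordinates are preferred, a small variation (my assumption about the intent, not part of the original answer) is to draw each coordinate with random.uniform instead of choosing from a discretized range:

# draw coordinates continuously instead of from np.linspace grids
x = random.uniform(0, a)
y = random.uniform(0, b)
z = random.uniform(0, c)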

Generating equidistant points along the boundary of a polygon (CW/CCW)

Suppose I have the vertices of a polygon and they are all oriented CCW. I wish to generate n equidistant points along the boundary of this polygon. Does anyone know of an existing package that does this and, if not, an algorithm one could use? I am working in Python. For example, here is what I would like if the polygon in question is a rectangle:
shapely:
import shapely.geometry as sg
import shapely.affinity as sa
import matplotlib.pyplot as P
import numpy as np

n = 7   # number of polygon vertices
k = 11  # number of equidistant boundary points

ori = sg.Point([0, 0])
p = [sg.Point([0, 1])]
for j in range(1, n):
    p.append(sa.rotate(p[-1], 360 / n, origin=ori))
ngon = sg.Polygon(p)

P.figure()
P.plot(*ngon.exterior.xy, "-k")
P.scatter(*np.transpose([ngon.exterior.interpolate(t).xy
                         for t in np.linspace(0, ngon.length, k, False)])[0])
P.axis("equal"); P.box("off"); P.axis("off")
P.show(block=0)
To add to the possible solutions, and so I have a record: this incarnation just uses NumPy to densify the boundary of polygons or polylines.
import numpy as np

def _pnts_on_line_(a, spacing=1, is_percent=False):  # densify by distance
    """Add points, at a fixed spacing, to an array representing a line.

    **See** `densify_by_distance` for documentation.

    Parameters
    ----------
    a : array
        A sequence of `points`, x,y pairs, representing the bounds of a polygon
        or polyline object.
    spacing : number
        Spacing between the points to be added to the line.
    is_percent : boolean
        Express the densification as a percent of the total length.

    Notes
    -----
    Called by `pnt_on_poly`.
    """
    N = len(a) - 1  # segments
    dxdy = a[1:, :] - a[:-1, :]  # coordinate differences
    leng = np.sqrt(np.einsum('ij,ij->i', dxdy, dxdy))  # segment lengths
    if is_percent:  # as percentage
        spacing = abs(spacing)
        spacing = min(spacing / 100, 1.)
        steps = (sum(leng) * spacing) / leng  # step distance
    else:
        steps = leng / spacing  # step distance
    deltas = dxdy / (steps.reshape(-1, 1))  # coordinate steps
    pnts = np.empty((N,), dtype='O')  # construct an `O` array
    for i in range(N):  # cycle through the segments and make
        num = np.arange(steps[i])  # the new points
        pnts[i] = np.array((num, num)).T * deltas[i] + a[i]
    a0 = a[-1].reshape(1, -1)  # add the final point and concatenate
    return np.concatenate((*pnts, a0), axis=0)
Results for a hexagon densified at a 4 unit point spacing.
h = np.array([[-8.66,   5.00], [ 0.00,  10.00], [ 8.66,   5.00], [ 8.66,  -5.00],
              [ 0.00, -10.00], [-8.66,  -5.00], [-8.66,   5.00]])  # ---- hexagon
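For completeness, the call that matches that description would presumably be:

new_pts = _pnts_on_line_(h, spacing=4)  # densify the hexagon at a 4 unit spacing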
Addendum
As suggested in a comment, I will add the densify-by-factor incarnation:
h = np.array([[-8.66, 5.00], [-2.5, 5.0], [0.00, 10.00], [2.5, 5.0], [8.66, 5.00],
              [8.66, -5.00], [0.00, -10.00], [-8.66, 5.00]])  # ---- a polygon
_densify_by_factor(h, factor=3)
def _densify_by_factor(a, factor=2):
    """Densify a 2D array using np.interp.

    Parameters
    ----------
    a : array
        A 2D array of points representing a polyline/polygon boundary.
    factor : number
        The factor to densify the line segments by.
    """
    a = np.squeeze(a)
    n_fact = len(a) * factor
    b = np.arange(0, n_fact, factor)
    b_new = np.arange(n_fact - 1)  # where to interpolate
    c0 = np.interp(b_new, b, a[:, 0])
    c1 = np.interp(b_new, b, a[:, 1])
    n = c0.shape[0]
    c = np.zeros((n, 2))
    c[:, 0] = c0
    c[:, 1] = c1
    return c

How to find intersection of a line with a mesh?

I have trajectory data, where each trajectory consists of a sequence of coordinates (x, y points) and each trajectory is identified by a unique ID.
These trajectories are in the x-y plane, and I want to divide the whole plane into an equal-sized (square) grid. This grid is obviously invisible but is used to divide the trajectories into sub-segments. Whenever a trajectory intersects a grid line, it is segmented there and becomes a new sub-trajectory with a new_id.
I have included a simple handmade graph to make clear what I am expecting.
It can be seen how the trajectory is divided at the intersections of the grid lines, and each of these segments has new unique id.
I am working on Python, and seek some python implementation links, suggestions, algorithms, or even a pseudocode for the same.
Please let me know if anything is unclear.
UPDATE
In order to divide the plane into grid , cell indexing is done as following:
# finding cell id for each coordinate
# cellid = (coord / cellSize).astype(int)
cellid = (coord / 0.5).astype(int)
cellid
Out[]: array([[1, 1],
              [3, 1],
              [4, 2],
              [4, 4],
              [5, 5],
              [6, 5]])
#Getting x-cell id and y-cell id separately
x_cellid = cellid[:,0]
y_cellid = cellid[:,1]
#finding total number of cells
xmax = df.xcoord.max()
xmin = df.xcoord.min()
ymax = df.ycoord.max()
ymin = df.ycoord.min()
no_of_xcells = math.floor((xmax-xmin)/ 0.5)
no_of_ycells = math.floor((ymax-ymin)/ 0.5)
total_cells = no_of_xcells * no_of_ycells
total_cells
Out[] : 25
Since the plane is now divided into 25 cells, each with a cell id, I could check the next coordinate in the trajectory to find intersections: if the cell id remains the same, then that segment of the trajectory lies in the same cell and has no intersection with the grid. If, say, x_cellid[2] is greater than x_cellid[0], then the segment intersects vertical grid lines. Though, I am still unsure how to find the intersections with the grid lines and segment the trajectory at the intersections, giving the segments new ids.
This can be solved by shapely:
%matplotlib inline
import pylab as pl
from shapely.geometry import MultiLineString, LineString
import numpy as np
from matplotlib.collections import LineCollection

x0, y0, x1, y1 = -10, -10, 10, 10
n = 11

lines = []
for x in np.linspace(x0, x1, n):
    lines.append(((x, y0), (x, y1)))
for y in np.linspace(y0, y1, n):
    lines.append(((x0, y), (x1, y)))
grid = MultiLineString(lines)

x = np.linspace(-9, 9, 200)
y = np.sin(x) * x
line = LineString(np.c_[x, y])

fig, ax = pl.subplots()
# iterate the sub-segments via .geoms (required by shapely >= 2.0)
for i, segment in enumerate(line.difference(grid).geoms):
    x, y = segment.xy
    pl.plot(x, y)
    pl.text(np.mean(x), np.mean(y), str(i))

lc = LineCollection(lines, color="gray", lw=1, alpha=0.5)
ax.add_collection(lc)
The result:
To not use shapely, and do it yourself:
import pylab as pl
import numpy as np
from matplotlib.collections import LineCollection

x0, y0, x1, y1 = -10, -10, 10, 10
n = 11
xgrid = np.linspace(x0, x1, n)
ygrid = np.linspace(y0, y1, n)

x = np.linspace(-9, 9, 200)
y = np.sin(x) * x
t = np.arange(len(x))

# parameter values where the curve crosses the vertical grid lines
idx_grid, idx_t = np.where((xgrid[:, None] - x[None, :-1]) * (xgrid[:, None] - x[None, 1:]) <= 0)
tx = idx_t + (xgrid[idx_grid] - x[idx_t]) / (x[idx_t + 1] - x[idx_t])

# parameter values where the curve crosses the horizontal grid lines
idx_grid, idx_t = np.where((ygrid[:, None] - y[None, :-1]) * (ygrid[:, None] - y[None, 1:]) <= 0)
ty = idx_t + (ygrid[idx_grid] - y[idx_t]) / (y[idx_t + 1] - y[idx_t])

# duplicate the crossing parameters so each crossing point ends one segment
# and starts the next
t2 = np.sort(np.r_[t, tx, tx, ty, ty])
x2 = np.interp(t2, t, x)
y2 = np.interp(t2, t, y)

# split at the duplicated parameter values
loc = np.where(np.diff(t2) == 0)[0] + 1
xlist = np.split(x2, loc)
ylist = np.split(y2, loc)

fig, ax = pl.subplots()
for i, (xp, yp) in enumerate(zip(xlist, ylist)):
    pl.plot(xp, yp)
    pl.text(np.mean(xp), np.mean(yp), str(i))

lines = []
for x in np.linspace(x0, x1, n):
    lines.append(((x, y0), (x, y1)))
for y in np.linspace(y0, y1, n):
    lines.append(((x0, y), (x1, y)))
lc = LineCollection(lines, color="gray", lw=1, alpha=0.5)
ax.add_collection(lc)
You're asking a lot. You should attack most of the design and coding yourself, once you have a general approach. Algorithm identification is reasonable for Stack Overflow; asking for design and reference links is not.
I suggest that you put the point coordinates into a list. Use the NumPy and SciKit capabilities to interpolate the grid intersections. You can store segments in a list (of whatever defines a segment in your data design). Consider making a dictionary that allows you to retrieve the segments by grid coordinates. For instance, if segments are denoted only by their endpoints, and points are a class of yours, you might have something like this, using the lower-left corner of each square as its defining point:
grid_seg = {
    (0.5, 0.5): [p0, p1],
    (1.0, 0.5): [p1, p2],
    (1.0, 1.0): [p2, p3],
    ...
}
where p0, p1, etc. are the interpolated crossing points.
Each trajectory is composed of a series of straight line segments. You therefore need a routine to break each line segment into sections that lie completely within a grid cell. The basis for such a routine would be the Digital Differential Analyzer (DDA) algorithm, though you'll need to modify the basic algorithm, since you need the endpoints of the line within each cell, not just the cells that are visited; a small sketch follows after the caveats below.
A couple of things you have to be careful of:
1) If you're working with floating-point numbers, beware of rounding errors in the calculation of the step values, as these can cause the algorithm to fail. For this reason many people choose to convert to an integer grid, obviously with a loss of precision. This is a good discussion of the issues, with some working code (though not Python).
2) You'll need to decide which of the 4 grid lines surrounding a cell belong to the cell. One convention would be to use the bottom and left edges. You can see the issue if you consider a horizontal line segment that falls exactly on a grid line: does it belong to the cell above or the cell below?
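To make the idea concrete, here is a minimal sketch (mine, not from the discussion above) that splits a single segment at every grid-line crossing by solving for the crossing parameters directly; a true DDA implementation would instead walk the line cell by cell:

import numpy as np

def split_segment(p0, p1, cell=1.0):
    # Return the points of segment p0->p1, split at every crossing of the
    # grid lines x = k*cell and y = k*cell (endpoints included).
    p0, p1 = np.asarray(p0, dtype=float), np.asarray(p1, dtype=float)
    d = p1 - p0
    ts = [0.0, 1.0]
    for axis in (0, 1):
        if d[axis] != 0:
            lo, hi = sorted((p0[axis], p1[axis]))
            ks = np.arange(np.ceil(lo / cell), np.floor(hi / cell) + 1)
            ts.extend((ks * cell - p0[axis]) / d[axis])
    ts = np.unique(np.clip(ts, 0.0, 1.0))
    return [tuple(p0 + t * d) for t in ts]

# split_segment((0.2, 0.2), (2.3, 1.1), cell=1.0) yields the endpoints plus
# the crossings of x=1, x=2 and y=1.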
Cheers
data = list of list of coordinates
for point_id, point_coord in enumerate(point_coord_list):
    if current point & last point stayed in same cell:
        append point's index to last list of data
    else:
        append a new empty list to data
        interpolate the two points and add a new point
        that is on the grid lines
Data stores all trajectories. Each list within the data is a trajectory.
The cell index along the x and y axes (x_cell_id, y_cell_id) can be found by dividing the coordinates of a point by the cell size and truncating to integers. If the cell indices of the current point are the same as those of the last point, the two points are in the same cell. A list is good for inserting new points, but it is not as memory efficient as an array.
It might be a good idea to create a class for trajectories (a minimal sketch follows below), or to use a memory buffer and a sparse data structure instead of nested lists, with an array for the x-y coordinates, if the list of coordinates wastes too much memory.
Inserting new points into an array is slow, so we can use a separate array for the new points.
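As a minimal sketch of such a class (the names and fields are my own suggestion):

from dataclasses import dataclass
import numpy as np

@dataclass
class Trajectory:
    new_id: int          # unique id of the sub-trajectory
    cell: tuple          # (x_cell_id, y_cell_id) of the cell it lies in
    coords: np.ndarray   # (n, 2) array of the x, y coordinates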
Warning: I haven't thought too much about the things below. It probably has bugs, and someone needs to fill in the gaps.
# coord: n x 2 numpy array.
# columns 0, 1 are the x and y coordinates.
# row i is for point i.
# cell_size: length of one side of the square cell.
# n_x_cells: number of cells along the x axis.
import numpy as np

cell_id_2d = (coord / cell_size).astype(int)
x_cell_id = cell_id_2d[:, 0]
y_cell_id = cell_id_2d[:, 1]
cell_id_1d = x_cell_id + y_cell_id * n_x_cells

# if the trajectory exits a cell, its cell id changes
# and the delta_cell_id is not zero.
delta_cell_id = cell_id_1d[1:] - cell_id_1d[:-1]

# The nth trajectory should contain the points from
# the (crossing_ids[n])th to the (crossing_ids[n + 1] - 1)th
w = np.where(delta_cell_id != 0)[0]
crossing_ids = np.empty(w.size + 1)
crossing_ids[1:] = w
crossing_ids[0] = 0

# need to interpolate when the trajectory crosses a cell boundary.
# probably can replace this loop with numpy functions/indexing
new_points = np.empty((w.size, 2))
for i in range(1, n):
    st = coord[crossing_ids[i]]
    en = coord[crossing_ids[i + 1]]
    # 1. check which boundary of the cell is crossed
    # 2. interpolate
    # 3. put points into new_points

# Each trajectory contains some points from the coord array and 2 points
# in the new_points array.
For retrieval, make a sparse array that contains the index of the starting point in the coord array; a sketch follows below.
Linear interpolation can look bad if the cell size is large.
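A sketch of that retrieval structure (my own; a plain dict stands in for the sparse array, reusing the variables from the code above):

from collections import defaultdict

# map each 1-D cell id to the starting indices (into coord) of the
# sub-trajectories that begin in that cell
start_lookup = defaultdict(list)
for start_idx in crossing_ids.astype(int):
    start_lookup[int(cell_id_1d[start_idx])].append(start_idx)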
Further explanation:
Description of the grid
For n_xcells = 4, n_ycells = 3, the grid is:
0 1 2 3 4
0 [ ][ ][ ][ ][ ]
1 [ ][ ][ ][* ][ ]
2 [ ][ ][ ][ ][ ]
[* ] has an x_index of 3 and a y_index of 1.
There are (n_x_cells * n_y_cells) cells in the grid.
Relationship between point and cell
The cell that contains the ith point of the trajectory has an x_index of x_cell_id[i] and a y_index of y_cell_id[i]. I get this by discretization: dividing the xy-coordinates of the points by the cell size and then truncating to integers.
The cell_id_1d of the cells are the number in [ ]
0 1 2 3 4
0 [0 ][1 ][2 ][3 ][4 ]
1 [5 ][6 ][7 ][8 ][9 ]
2 [10][11][12][13][14]
cell_id_1d[i] = x_cell_id[i] + y_cell_id[i]*n_x_cells
I converted the pair of cell indices (x_cell_id[i], y_cell_id[i]) for the ith point to a single index called cell_id_1d.
How to find if trajectory exit a cell at the ith point
Now, the ith and (i + 1)th points are in the same cell if and only if (x_cell_id[i], y_cell_id[i]) == (x_cell_id[i + 1], y_cell_id[i + 1]), which is equivalent to cell_id_1d[i] == cell_id_1d[i + 1]. Therefore delta_cell_id[i] = cell_id_1d[i + 1] - cell_id_1d[i] is zero if and only if the ith and (i + 1)th points are in the same cell.

Fast 3 to 7 D interpolation on non uniform & non rectangular grid

[COMPLETELY REWRITTEN]
I'm looking for a way of interpolating a large set of data in dimensions ranging from 3 to 7.
The data is, by nature, on a non-rectangular grid and non-uniformly spaced.
I looked at every option I could think of (griddata, KDTree + magic, linear interpolation, reworked map_coordinates, ...): the fastest and most usable tool seems to be SciPy's LinearNDInterpolator class. Linear interpolation in such a high-dimensional space is fine and should be precise enough.
However, there is one big shortcoming with this class: data with gaps or "concave regions" will produce extrapolated results when I want only interpolation.
This is best seen with some pictures (2-D test). In the following I produce some randomly generated data for X, and VALUE, while Y is upper-bounded by a function of X (just so I create gaps).
After rescaling the data (mostly done using pieces of code of the LinearNDInterpolator from the master, i.e. development, branch), the Delaunay triangulation will produce a convex hull that includes the gap, and will "extrapolate" in this region. The term "extrapolate" is not really correct here, in a technical sense, but I think it is appropriate given that the original data is assumed to be sufficiently well sampled, so that the big gaps mean "no data allowed" (not physical).
To start handling the problem, I "tagged" every Delaunay (hyper-)triangle whose (hyper-)volume is higher than a user-defined threshold (by default, the volume equivalent to 5% of the data extent in each dimension).
Generating random data and evaluating the values using this technique produces the following figure:
Black dots (with red or white rings) are the randomly generated data points to be evaluated. Red rings indicate points that are rejected (i.e. value = NaN) by my custom class based on LinearNDInterpolator, and white rings show accepted points.
For clarity, I've plotted the triangles that were rejected from the original Delaunay triangulation.
As you can see, there are still some white-ringed points that fall in the gap, which I do not want. This is because the simplex to which they belong has a volume less than the authorized maximum volume (some of these triangles even appear as lines on the figure, so they are hard to see).
My question is: how could I improve from here? What could be done?
I was thinking of grabbing all points that fall in a small ball around each evaluated point and checking whether there are any data points in it (a sketch of that test is below). But this is not a good solution, since it would be resource-consuming and not precise enough (e.g. what about points very close to the bottom of the gap, but still outside the upper envelope?)
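For reference, the ball test described above would look like this with SciPy's cKDTree (data_points, eval_points, and the radius r are placeholders, not names from the module below):

from scipy.spatial import cKDTree
import numpy as np

tree = cKDTree(data_points)  # data_points: (n, ndim) original data
neighbors = tree.query_ball_point(eval_points, r=0.05)  # r: user-chosen radius
accepted = np.array([len(nb) > 0 for nb in neighbors])  # reject points with empty balls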
Here is my custom interpolation module I used:
#!/usr/bin/env python
"""
Custom N-D linear interpolation class, based on scipy's LinearNDInterpolator.
The main differences are:
- auto-scaling
- interpolation: inside convex hull (normal behavior), and "close enough" to original data.
  This rejects points that would normally be interpolated by LinearNDInterpolator.
"""
# ================
# Python modules
# ================
import pickle
import numpy as np
from scipy.spatial import Delaunay
from scipy.interpolate import LinearNDInterpolator
from scipy.special import factorial

# =======================
# Convenience functions
# =======================
def _inv_log10(x):
    return 10**x

def det(coords):
    """
    Return the determinant of the given coordinates (not the usual determinant, but
    the one used to compute the hyper-volume of a hyper-triangle).

    From a Delaunay triangulation, the coordinates of one simplex (i.e. hyper-triangle)
    are given by:
        coords_i = tri.points[simplex_i]
    where
        tri = Delaunay(points)
        simplex_i = tri.simplices[i]

    In an N-dimensional space, the simplex will have N+1 points, each one of them of
    dimension N. E.g. in 3D, point i has coordinates pi = (xi, yi, zi), so
    p1 = (x1, y1, z1). The determinant is

              |x1 x2 x3 x4|
              |y1 y2 y3 y4|   |(x1-x4) (x2-x4) (x3-x4)|
        det = |z1 z2 z3 z4| = |(y1-y4) (y2-y4) (y3-y4)|
              |1  1  1  1 |   |(z1-z4) (z2-z4) (z3-z4)|
    """
    q = coords[:-1, :] - coords[-1, None, :]
    sign, logdet = np.linalg.slogdet(q)
    return sign * np.exp(logdet)
# ==============================
# LinearNDInterpolator wrapper
# ==============================
class Interp(object):
    """
    Simple wrapper around LinearNDInterpolator.
    """
    def __init__(self, points, values, **kwargs):
        """
        :param points: list of coordinates (eg. [(0, 1), (0, 3), (4, 4.5)] for 3 points in 2-D)
        :param values: list of associated value(s) for each point (eg. [1, 2, 3] for 3 points of single value)
        :keyword rescale: rescale data points so that the final extent is [0, 1] in every dimension
        :keyword transform: transform data points (prior to rescaling). If True, automatically
            transform dimension coordinates if extents span more than 2 orders of magnitude.
            It can also be a list of tuples of (transformation function, inverse function),
            that will be applied whenever needed.
        :keyword fill_value: outside bounds interpolation values (default: np.nan)
        """
        try:
            points = np.asanyarray(points, dtype=np.float64)
            values = np.asanyarray(values, dtype=np.float64)
        except ValueError:
            raise ValueError('Cannot convert input points to an array of floats')

        # dimensions / number of points and values
        self.ndim = points.shape[1]
        self.nvalues = values.shape[1]
        self.npoints = points.shape[0]

        # locals
        self._idims = range(self.ndim)

        # extents
        self.minis = np.min(points, axis=0)
        self.maxis = np.max(points, axis=0)
        self.ranges = self.maxis - self.minis
        self.magnitudes = self.maxis / self.minis

        # options
        rescale = kwargs.pop('rescale', True)
        transform = kwargs.pop('transform', True)
        fill_value = kwargs.pop('fill_value', np.nan)

        # transformation
        if transform:
            transforms = []
            if transform is True:
                # automatic transformation -> if extent >= 2 orders of magnitude: f(x) = log10(x)
                for i, e in enumerate(self.magnitudes):
                    if e >= 100.:
                        transforms.append((np.log10, _inv_log10))
                    else:
                        transforms.append(None)
                if not transforms:
                    transforms = None
            else:
                err_msg = 'transform: both the transformation function and its inverse must be given in a tuple'
                if not isinstance(transform, (tuple, list)):
                    raise ValueError(err_msg)
                if (self.ndim > 1) and (len(transform) != self.ndim):
                    raise ValueError('transform: None or transformations tuple must be given for every dimension')
                for t in transform:
                    if not isinstance(t, (tuple, list)):
                        raise ValueError(err_msg)
                    elif t is None:
                        transforms.append(None)
                    else:
                        transforms.append(t)
            self.transforms = transforms
        else:
            self.transforms = None
        points = self._transform(points)

        # scaling
        self.offset = 0.
        self.scale = 1.
        self.rescale = rescale
        if rescale:
            self.offset = np.mean(points, axis=0)
            self.scale = np.ptp(points - self.offset, axis=0)
            self.scale[~(self.scale > 0)] = 1.0  # avoid division by 0
            points = self._rescale(points)

        # triangulation
        self.tri = self._triangulate(points)

        # volumes
        self.fact = 1. / factorial(self.ndim)
        self.volume_max = np.prod(np.ptp(self.tri.points, axis=0) * 0.05)  # 5% peak-to-peak in each dimension
        self.rej_idx = None
        self.rej_vol = None
        self.cached_rej = False

        # linear interpolation
        self.fill_value = fill_value
        self.func = LinearNDInterpolator(self.tri, values, fill_value=fill_value)
    def _triangulate(self, points, **kwargs):
        """
        Delaunay triangulation
        """
        return Delaunay(points, **kwargs)

    def _get_volume_simplex(self, point):
        """
        Compute the simplex volume of the given point
        """
        i = self.tri.find_simplex(point)
        idx = self.tri.simplices[i]
        return np.abs(self.fact * det(self.tri.points[idx]))

    def cache_rejected_triangles(self, p=None, check_min=False):
        """
        Cache the indexes of rejected triangles.

        OPTIONS
        p -- peak-to-peak percentage in each dimension for the maximum volume calculation
             Default: None (default at __init__: p = 0.05)
             Type: float (0 < p <= 1)
             Type: list of floats (length = # dimensions)
        check_min -- check that the minimum spacing in each dimension is at least equal to p * extent
             Default: False
             Warning: *p* must be given
        """
        self.cached_rej = True
        if p is not None:
            p = np.array(p)
            # update the maximum hyper-triangle volume (p % of the extent in each dimension)
            self.volume_max = np.prod(np.ptp(self.tri.points, axis=0) * p)
        if check_min:
            assert p is not None, 'You must give *p* parameter for checking minimum volume of hyper-triangle'
            ptps = np.ptp(self.tri.points, axis=0)
            ps = np.ones(self.ndim) * p
            n_up = 0
            for i in self._idims:
                _x = np.unique(self.tri.points[:, i])
                mini = np.min(_x[1:] - _x[:-1])
                if mini > (ptps[i] * ps[i]):
                    n_up += 1
                    print('WARNING: changed max. volume axis of dim. %d from %.3g to %.3g' % (i + 1, ps[i], mini))
                    ps[i] = mini
            if n_up:
                new_vol = np.prod(ptps * ps)
                print('CHANGE: old volume was = %.3g, and is now = %.3g' % (self.volume_max, new_vol))
                self.volume_max = new_vol
        rej_idx = []
        rej_vol = []
        for i, simplex in enumerate(self.tri.simplices):
            vol = np.abs(self.fact * det(self.tri.points[simplex]))
            if vol > self.volume_max:
                rej_idx.append(i)
                rej_vol.append(vol)
        self.rej_idx = np.array(rej_idx)
        self.rej_vol = np.array(rej_vol)
    def _transform(self, points, inverse=False):
        """
        Transform point coordinates using functions. Set 'inverse' to True to transform back.
        """
        if self.transforms is not None:
            j = 1 - int(inverse)
            for i in self._idims:
                t = self.transforms[i]
                if t is None:
                    continue
                points[:, i] = t[j](points[:, i])
        return points

    def _rescale(self, points, inverse=False):
        """
        Rescale point coordinates so that extents in each dimension span [0, 1].
        Set 'inverse' to True to scale back.
        """
        if self.rescale:
            if inverse:
                points = points * self.scale + self.offset
            else:
                points = (points - self.offset) / self.scale
        return points

    def _check(self, x, res):
        """
        Check that interpolation results are close enough to real data and have not been extrapolated.
        """
        points = np.asanyarray(x)
        if points.ndim == 1:
            # only 1 point
            values = np.asanyarray(res).reshape(1, self.ndim)
        else:
            # more than 1 point
            values = np.asanyarray(res).reshape(points.shape[0], self.ndim)
        if self.cached_rej:
            idx = np.unique(np.where(np.isfinite(values))[0])
            ui_tri, uii = np.unique(self.tri.find_simplex(points[idx]), return_inverse=True)
            umask = np.isin(ui_tri, self.rej_idx, assume_unique=True)
            mask = umask[uii]
            values[idx[mask], :] = self.fill_value
        else:
            for i, v in enumerate(values):
                if not np.isnan(v[0]):
                    vol = self._get_volume_simplex(points[i])
                    if vol > self.volume_max:
                        # reject
                        values[i][:] = self.fill_value
        return values.reshape(res.shape)

    def __call__(self, x, check=False):
        """
        Interpolate. If 'check' is True, check that interpolated points are close enough to real data.
        """
        _x = self._rescale(self._transform(x))
        res = self.func(_x)
        if check:
            res = self._check(_x, res)
        return res

    def ev(self, x, check=False):
        """
        Alias for __call__
        """
        return self.__call__(x, check=check)

    def get_original_points(self):
        """
        Return original points
        """
        return self._transform(self._rescale(self.func.points, inverse=True), inverse=True)

    def get_original_values(self):
        """
        Return original values
        """
        return self.func.values
# ===========================
# Save / load interpolation
# ===========================
def save(filename, interp):
    """
    Dump the Interp instance to a binary file with pickle (protocol 2)
    """
    with open(filename, 'wb') as f:
        pickle.dump(interp, f, protocol=2)

def load(filename):
    """
    Load a previously saved (pickled with the save function) Interp instance
    """
    with open(filename, 'rb') as f:
        interp = pickle.load(f)
    return interp
And the test script:
#!/usr/bin/env python
"""
Test the custom interpolation class (see interp.py)
"""
import numpy as np
from interp import Interp
import matplotlib.pyplot as plt

# generate random data
n = 2000  # number of generated points
x = np.random.random(n)

def f(v):
    maxi = v ** (1 / (v + 1e-5)) * (v - 5.) ** 2 - np.exp(v - 7) + 1
    return np.random.random() * maxi

y = np.array([f(v) for v in x * 10])  # list comprehension instead of py2 map()
z = np.random.random(n)
points = np.array((x, y)).T
values = np.random.random(points.shape)

# create interpolation function
func = Interp(points, values, transform=False)
func.cache_rejected_triangles(p=0.05, check_min=True)

# generate random data + evaluate
pts = np.random.random((500, points.shape[1]))
pts *= np.ptp(points, axis=0)
pts += points.min(0)
res = func(pts, check=True)

# rejected points indexes
idx_rej = np.unique(np.where(np.isnan(res))[0])
n_rej = len(idx_rej)
print('%d points (%.0f%%) have been rejected' % (n_rej, 100. * n_rej / pts.shape[0]))

# plot rejected triangles
fig = plt.figure()
ax = plt.gca()
for i in func.rej_idx:
    _x = [p for p in points[func.tri.simplices[i], 0]]
    _x += [points[func.tri.simplices[i][0], 0]]
    _y = [p for p in points[func.tri.simplices[i], 1]]
    _y += [points[func.tri.simplices[i][0], 1]]
    ax.plot(_x, _y, c='k', ls='-', zorder=100)

# plot original data
ax.scatter(points[:, 0], points[:, 1], c='b', linewidths=0, s=20, zorder=50)

# plot all points (both accepted and rejected): in white
ax.scatter(pts[:, 0], pts[:, 1], c='k', edgecolors='w', linewidths=1, zorder=150, s=30)

# re-plot rejected points: in red
ax.scatter(pts[idx_rej, 0], pts[idx_rej, 1], c='k', edgecolors='r', linewidths=1, zorder=200, s=30)

fig.savefig('img_tri.png', transparent=True, dpi=300)
