Related
I have a set of problems where I have multiple coordinates and I want to quickly detect where that coordinate lies within a list of pre-made boxes.
I provide a code for this type of problem. But I am using "for loop" iteration, which I think is quite slow given my current situation dealing with big data. That being set I might have thousands of boxes (rectangles/squares) and coordinates.
I am looking for any other alternative that may be efficient and fast for this type of problem?
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle
pad = 1
xn1 = np.linspace(0-(pad*2), 0+(pad*2), 3)
yn1 = np.linspace(0-(pad*2), 0+(pad*2), 3)
print(xn1, yn1)
xn1_list = []
yn1_list = []
xy1_list = []
# Create the coordinates
for p1 in range(0, len(xn1)):
for q1 in range(0, len(yn1)):
xn1_list.append(xn1[p1])
yn1_list.append(yn1[q1])
for pad1, coord1 in enumerate(zip(xn1_list, yn1_list)):
xy1_list.append(coord1)
print('\nxy1_list',xy1_list)
# Plot
fig, (ax1) = plt.subplots(figsize = (8,8))
for i in range(0, len(xy1_list)):
# print(len(xy3_list))
plt.scatter(xy1_list[i][0],xy1_list[i][1])
ax1.add_patch(Rectangle(xy=(xy1_list[i][0]-(pad*2)/2, xy1_list[i][1]-(pad*2)/2) ,width=pad*2, height=pad*2,
linewidth=1, color='blue', fill=False))
ax1.annotate(i, xy=(xy1_list[i][0], xy1_list[i][1]), textcoords='data', fontsize = 15)
plt.xticks(np.arange(-3, 3.1, step=1))
plt.yticks(np.arange(-3, 3.1, step=1))
# Current coordinate
X_cur, Y_cur = 0.5,2
plt.scatter(X_cur, Y_cur, color='r', marker='x')
# Iterate every possible box in the list
for i in range(0,len(xy1_list)):
Xmin_temp, Xmax_temp = xy1_list[i][0]-pad, xy1_list[i][0]+pad
Ymin_temp, Ymax_temp = xy1_list[i][1]-pad, xy1_list[i][1]+pad
# Check if the current coordinate is in one of the boxes
if (Xmin_temp < X_cur <= Xmax_temp) & (Ymin_temp < Y_cur <= Ymax_temp):
print('Current Coordinate is in Box'+str(i)+' with a centre coordinate of ('+str(xy1_list[i][0])+', '+str(xy1_list[i][1])+')')
Visualize the results:
enter image description here
Edited version according to #Green Cloak Guy idea
from math import floor, ceil
subdivisions = {} # hashtable of subdivisions of the graph
# key format: 2-tuple (x, y)
# value format: 4-tuple (left, right, bottom, top)
...
# Plot
fig, (ax1) = plt.subplots(figsize = (8,8))
for i in range(0, len(xy1_list)):
print('\nCenter Coord of the box'+str(i)+' - '+str(xy1_list[i]))
# renaming your variables to be easier to work with
# assuming [x, y] is center of each square
width = height = pad * 2 # note that you could have differently-shaped rectangles
left = xy1_list[i][0] - (width / 2)
right = xy1_list[i][0] + (width / 2)
bottom = xy1_list[i][1] - (height / 2)
top = xy1_list[i][1] + (height / 2)
print('x1:'+str(left)+' x2: '+str(right)+' y1: '+str(bottom)+' y2: '+str(top))
# add rectangle to plot, as you do in your code
plt.scatter(xy1_list[i][0], xy1_list[i][1])
ax1.add_patch(Rectangle(xy=(xy1_list[i][0]-width/2, xy1_list[i][1]-height/2), width=width, height=height,
linewidth=1, color='blue', fill=False))
ax1.annotate(i, xy=(xy1_list[i][0], xy1_list[i][1]), textcoords='data', fontsize = 15)
# # add rectangle to the appropriate subdivision(s)
x, y = xy1_list[i][0], xy1_list[i][1]
subdivisions.setdefault((x,y), [])
subdivisions[(x,y)].append((left, right, bottom, top))
...
# Current coordinate
X_cur, Y_cur = 0.5,2
plt.scatter(X_cur, Y_cur, color='r', marker='x')
# find box(es) coordinate is in
# in this case, that would be the box going from (0, 0) to (2, 2),
# which, in the coordinate dictionary, would be (0, 0)
boxes_contained = [
box
for box in subdivisions.get((x,y),[])
if (box[0] <= X_cur <= box[1]) and (box[2] <= Y_cur <= box[3])
]
But it does not return anything in the boxes_contained. How can I fix this based on the new code?
The solution I use for this kind of problem is to subdivide the entire grid into manageably small segments, and track in each segment which rectangles (or other objects) intersect with them. Then, when we have a point, we can simply find which segment of the grid the point is in, and we only have to test against a subset of the total number of rectangles. This is relatively easy to do by iterating once over the list of rectangles.
from math import floor, ceil
sub_size_x, sub_size_y = 2, 2 # record the size of our subdivisions, e.g. 2x2
subdivisions = {} # hashtable of subdivisions of the graph
# key format: 2-tuple (x // sub_size_x, y // sub_size_y)
# value format: 4-tuple (left, bottom, width, height)
...
# Plot
fig, (ax1) = plt.subplots(figsize = (8,8))
for i in range(0, len(xy1_list)):
# renaming your variables to be easier to work with
# assuming [x, y] is center of each square
width = height = pad * 2 # note that you could have differently-shaped rectangles
left = xylist[i][0] - (width / 2)
bottom = xylist[i][1] - (height / 2)
right = xylist[i][0] + (width / 2)
top = xylist[i][1] + (height / 2)
# add rectangle to plot, as you do in your code
plt.scatter(left + (width / 2), bottom + (height / 2))
ax1.add_patch(Rectangle(xy=(left, top), width=width, height=height,
linewidth=1, color='blue', fill=False))
ax1.annotate(i, xy=(xy1_list[i][0], xy1_list[i][1]), textcoords='data', fontsize = 15)
# add rectangle to the appropriate subdivision(s)
for x in range(floor(left / sub_size_x), ceil(right / sub_size_x)):
for y in range(floor(bottom / sub_size_y), ceil(top / sub_size_y)):
subdivisions.setdefault((x, y), [])
subdivisions[(x, y)].append((left, bottom, width, height))
...
# Current coordinate
X_cur, Y_cur = 0.5,2
plt.scatter(X_cur, Y_cur, color='r', marker='x')
# find box(es) coordinate is in
# in this case, that would be the box going from (0, 0) to (2, 2),
# which, in the coordinate dictionary, would be (0, 0)
boxes_contained = [
box
for box in subdivisions.get((X_cur // sub_size_x, Y_cur // sub_size_y), [])
if (box[0] <= X_cur <= box[0] + box[2]) and (box[1] <= Y_cur <= box[1] + box[3])
]
(note: my current python installation does not have numpy or pyplot and I'm not in a position to be able to install them right now, so I have not tested this code. But you should get the picture)
Note that this approach works for an arbitrary-sized subdivision, and also for rectangles of different sizes. If you were to use an actual Rectangle or other Shape class instead of the 4-tuple value, then you could modify the code to call a method to check intersection.
Ideally your subdivisions should be sized such that you're not duplicating too much information, and such that not too many boxes overlap the same subdivision. What dimensions are ideal depends on the dataset of boxes you're working woth, and this is a tradeoff between space complexity and time complexity.
To give y'all some context, I'm doing this inversion technique where I am trying to reproduce a profile using the integrated values. To do that I need to find the value within an array along a certain line(s). To exemplify my issue I have the following code:
fig, ax = plt.subplots(1, figsize = (10,10))
#Create the grid (different grid spacing):
X = np.arange(0,10.01,0.25)
Y = np.arange(0,10.01,1.00)
#Create the 2D array to be plotted
Z = []
for i in range(np.size(X)):
Zaux = []
for j in range(np.size(Y)):
Zaux.append(i*j + j)
ax.scatter(X[i],Y[j], color = 'red', s = 0.25)
Z.append(Zaux)
#Mesh the 1D grids:
Ymesh, Xmesh = np.meshgrid(Y, X)
#Plot the color plot:
ax.pcolor(Y,X, Z, cmap='viridis', vmin=np.nanmin(Z), vmax=np.nanmax(Z))
#Plot the points in the grid of the color plot:
for i in range(np.size(X)):
for j in range(np.size(Y)):
ax.scatter(Y[j],X[i], color = 'red', s = 3)
#Create a set of lines:
for i in np.linspace(0,2,5):
X_line = np.linspace(0,10,256)
Y_line = i*X_line*3.1415-4
#Plot each line:
ax.plot(X_line,Y_line, color = 'blue')
ax.set_xlim(0,10)
ax.set_ylim(0,10)
plt.show()
That outputs this graph:
I need to find the closest points in Z that are being crossed by each of the lines. The idea is to integrate the values in Z that are crossed by the blue lines and plot that as a function of slope of the lines. Anyone has a good solution for it? I've tried a set of for loops, but I think it's kind of clunky.
Anyway, thanks for your time...
I am not sure about the closest points thing. That seems "clunky" too. What if it passes exactly in the middle between two points? Also I already had written code that weighs the four neighbor pixels by their closeness for an other project so I am going with that. Also I take the liberty of not rescaling the picture.
i,j = np.meshgrid(np.arange(41),np.arange(11))
Z = i*j + j
class Image_knn():
def fit(self, image):
self.image = image.astype('float')
def predict(self, x, y):
image = self.image
weights_x = [1-(x % 1), x % 1]
weights_y = [1-(y % 1), y % 1]
start_x = np.floor(x).astype('int')
start_y = np.floor(y).astype('int')
return sum([image[np.clip(np.floor(start_x + x), 0, image.shape[0]-1).astype('int'),
np.clip(np.floor(start_y + y), 0, image.shape[1]-1).astype('int')] * weights_x[x]*weights_y[y]
for x,y in itertools.product(range(2),range(2))])
And a little sanity check it returns the picture if we give it it's coordinates.
image_model = Image_knn()
image_model.fit(Z)
assert np.allclose(image_model.predict(*np.where(np.ones(Z.shape, dtype='bool'))).reshape((11,41)), Z)
I generate m=100 lines and scale the points on them so that they are evenly spaced. Here is a plot of every 10th of them.
n = 1000
m = 100
slopes = np.linspace(1e-10,10,m)
t, slope = np.meshgrid(np.linspace(0,1,n), slopes)
x_max, y_max = Z.shape[0]-1, Z.shape[1]-1
lines_x = t
lines_y = t*slope
scales = np.broadcast_to(np.stack([x_max/lines_x[:,-1], y_max/lines_y[:,-1]]).min(axis=0), (n,m)).T
lines_x *= scales
lines_y *= scales
And finally I can get the "points" consisting of slope and "integral" and draw it. You probably should take a closer look at the "integral" it's just a ruff guess of mine.
%%timeit
points = np.array([(slope, np.mean(image_model.predict(lines_x[i],lines_y[i]))
*np.linalg.norm(np.array((lines_x[i,-1],lines_y[i,-1]))))
for i,slope in enumerate(slopes)])
plt.scatter(points[:,0],points[:,1])
Notice the %%timeit in the last block. This takes ~38.3 ms on my machine and therefore wasn't optimized. As Donald Knuth puts it "premature optimization is the root of all evil". If you were to optimize this you would remove the for loop, shove all the coordinates for line points in the model at once by reshaping and reshaping back and then organize them with the slopes. But I saw no reason to put myself threw that for a few ms.
And finally we get a nice cusp as a reward. Notice that it makes sense that the maximum is at 4 since the diagonal is at a slope of 4 for our 40 by 10 picture. The intuition for the cusp is a bit harder to explain but I guess you probably have that already. For the length it comes down to the function (x,y) -> sqrt(x^2+y^2) having different directional differentials when going up and when going left on the rectangle.
I have trajectory data, where each trajectory consists of a sequence of coordinates(x, y points) and each trajectory is identified by a unique ID.
These trajectories are in x - y plane, and I want to divide the whole plane into equal sized grid (square grid). This grid is obviously invisible but is used to divide trajectories into sub-segments. Whenever a trajectory intersects with a grid line, it is segmented there and becomes a new sub-trajectory with new_id.
I have included a simple handmade graph to make clear what I am expecting.
It can be seen how the trajectory is divided at the intersections of the grid lines, and each of these segments has new unique id.
I am working on Python, and seek some python implementation links, suggestions, algorithms, or even a pseudocode for the same.
Please let me know if anything is unclear.
UPDATE
In order to divide the plane into grid , cell indexing is done as following:
#finding cell id for each coordinate
#cellid = (coord / cellSize).astype(int)
cellid = (coord / 0.5).astype(int)
cellid
Out[] : array([[1, 1],
[3, 1],
[4, 2],
[4, 4],
[5, 5],
[6, 5]])
#Getting x-cell id and y-cell id separately
x_cellid = cellid[:,0]
y_cellid = cellid[:,1]
#finding total number of cells
xmax = df.xcoord.max()
xmin = df.xcoord.min()
ymax = df.ycoord.max()
ymin = df.ycoord.min()
no_of_xcells = math.floor((xmax-xmin)/ 0.5)
no_of_ycells = math.floor((ymax-ymin)/ 0.5)
total_cells = no_of_xcells * no_of_ycells
total_cells
Out[] : 25
Since the plane is now divided into 25 cells each with a cellid. In order to find intersections, maybe I could check the next coordinate in the trajectory, if the cellid remains the same, then that segment of the trajectory is in the same cell and has no intersection with grid. Say, if x_cellid[2] is greater than x_cellid[0], then segment intersects vertical grid lines. Though, I am still unsure how to find the intersections with the grid lines and segment the trajectory on intersections giving them new id.
This can be solved by shapely:
%matplotlib inline
import pylab as pl
from shapely.geometry import MultiLineString, LineString
import numpy as np
from matplotlib.collections import LineCollection
x0, y0, x1, y1 = -10, -10, 10, 10
n = 11
lines = []
for x in np.linspace(x0, x1, n):
lines.append(((x, y0), (x, y1)))
for y in np.linspace(y0, y1, n):
lines.append(((x0, y), (x1, y)))
grid = MultiLineString(lines)
x = np.linspace(-9, 9, 200)
y = np.sin(x)*x
line = LineString(np.c_[x, y])
fig, ax = pl.subplots()
for i, segment in enumerate(line.difference(grid)):
x, y = segment.xy
pl.plot(x, y)
pl.text(np.mean(x), np.mean(y), str(i))
lc = LineCollection(lines, color="gray", lw=1, alpha=0.5)
ax.add_collection(lc);
The result:
To not use shapely, and do it yourself:
import pylab as pl
import numpy as np
from matplotlib.collections import LineCollection
x0, y0, x1, y1 = -10, -10, 10, 10
n = 11
xgrid = np.linspace(x0, x1, n)
ygrid = np.linspace(y0, y1, n)
x = np.linspace(-9, 9, 200)
y = np.sin(x)*x
t = np.arange(len(x))
idx_grid, idx_t = np.where((xgrid[:, None] - x[None, :-1]) * (xgrid[:, None] - x[None, 1:]) <= 0)
tx = idx_t + (xgrid[idx_grid] - x[idx_t]) / (x[idx_t+1] - x[idx_t])
idx_grid, idx_t = np.where((ygrid[:, None] - y[None, :-1]) * (ygrid[:, None] - y[None, 1:]) <= 0)
ty = idx_t + (ygrid[idx_grid] - y[idx_t]) / (y[idx_t+1] - y[idx_t])
t2 = np.sort(np.r_[t, tx, tx, ty, ty])
x2 = np.interp(t2, t, x)
y2 = np.interp(t2, t, y)
loc = np.where(np.diff(t2) == 0)[0] + 1
xlist = np.split(x2, loc)
ylist = np.split(y2, loc)
fig, ax = pl.subplots()
for i, (xp, yp) in enumerate(zip(xlist, ylist)):
pl.plot(xp, yp)
pl.text(np.mean(xp), np.mean(yp), str(i))
lines = []
for x in np.linspace(x0, x1, n):
lines.append(((x, y0), (x, y1)))
for y in np.linspace(y0, y1, n):
lines.append(((x0, y), (x1, y)))
lc = LineCollection(lines, color="gray", lw=1, alpha=0.5)
ax.add_collection(lc);
You're asking a lot. You should attack most of the design and coding yourself, once you have a general approach. Algorithm identification is reasonable for Stack Overflow; asking for design and reference links is not.
I suggest that you put the point coordinates into a list. use the NumPy and SciKit capabilities to interpolate the grid intersections. You can store segments in a list (of whatever defines a segment in your data design). Consider making a dictionary that allows you to retrieve the segments by grid coordinates. For instance, if segments are denoted only by the endpoints, and points are a class of yours, you might have something like this, using the lower-left corner of each square as its defining point:
grid_seg = {
(0.5, 0.5): [p0, p1],
(1.0, 0.5): [p1, p2],
(1.0, 1.0): [p2, p3],
...
}
where p0, p1, etc. are the interpolated crossing points.
Each trajectory is composed of a series of straight line segments. You therefore need a routine to break each line segment into sections that lie completely within a grid cell. The basis for such a routine would be the Digital Differential Analyzer (DDA) algorithm, though you'll need to modify the basic algorithm since you need endpoints of the line within each cell, not just which cells are visited.
A couple of things you have to be careful of:
1) If you're working with floating point numbers, beware of rounding errors in the calculation of the step values, as these can cause the algorithm to fail. For this reason many people choose to convert to an integer grid, obviously with a loss of precision. This is a good discussion of the issues, with some working code (though not python).
2) You'll need to decide which of the 4 grid lines surrounding a cell belong to the cell. One convention would be to use the bottom and left edges. You can see the issue if you consider a horizontal line segment that falls on a grid line - does its segments belong to the cell above or the cell below?
Cheers
data = list of list of coordinates
For point_id, point_coord in enumerate(point_coord_list):
if current point & last point stayed in same cell:
append point's index to last list of data
else:
append a new empty list to data
interpolate the two points and add a new point
that is on the grid lines.
Data stores all trajectories. Each list within the data is a trajectory.
The cell index along x and y axes (x_cell_id, y_cell_id) can be found by dividing coordinate of point by dimension of cell, then round to integer. If the cell indices of current point are same as that of last points, then these two points are in the same cell. list is good for inserting new points but it is not as memory efficient as arrays.
Might be a good idea to create a class for trajectory. Or use a memory buffer and sparse data structure instead of list and list and an array for the x-y coordinates if the list of coordinates wastes too much memory.
Inserting new points into array is slow, so we can use another array for new points.
Warning: I haven't thought too much about the things below. It probably has bugs, and someone needs to fill in the gaps.
# coord n x 2 numpy array.
# columns 0, 1 are x and y coordinate.
# row n is for point n
# cell_size length of one side of the square cell.
# n_ycells number of cells along the y axis
import numpy as np
cell_id_2d = (coord / cell_size).astype(int)
x_cell_id = cell_id_2d[:,0]
y_cell_id = cell_id_2d[:,1]
cell_id_1d = x_cell_id + y_cell_id*n_x_cells
# if the trajectory exits a cell, its cell id changes
# and the delta_cell_id is not zero.
delta_cell_id = cell_id_1d[1:] - cell_id_1d[:-1]
# The nth trajectory should contains the points from
# the (crossing_id[n])th to the (crossing_id[n + 1] - 1)th
w = np.where(delta_cell_id != 0)[0]
crossing_ids = np.empty(w.size + 1)
crossing_ids[1:] = w
crossing_ids[0] = 0
# need to interpolate when the trajectory cross cell boundary.
# probably can replace this loop with numpy functions/indexing
new_points = np.empty((w.size, 2))
for i in range(1, n):
st = coord[crossing_ids[i]]
en = coord[crossing_ids[i+1]]
# 1. check which boundary of the cell is crossed
# 2. interpolate
# 3. put points into new_points
# Each trajectory contains some points from coord array and 2 points
# in the new_points array.
For retrieval, make a sparse array that contains the index of the starting point in the coord array.
Linear interpolation can look bad if the cell size is large.
Further explanation:
Description of the grid
For n_xcells = 4, n_ycells = 3, the grid is:
0 1 2 3 4
0 [ ][ ][ ][ ][ ]
1 [ ][ ][ ][* ][ ]
2 [ ][ ][ ][ ][ ]
[* ] has an x_index of 3 and a y_index of 1.
There are (n_x_cells * n_y_cells) cells in the grid.
Relationship between point and cell
The cell that contains the ith point of the trajectory has an x_index of x_cell_id[i] and a y_index of x_cell_id[i]. I get this by discretization through dividing the xy-coordinates of the points by the length of the cell and then truncate to integers.
The cell_id_1d of the cells are the number in [ ]
0 1 2 3 4
0 [0 ][1 ][2 ][3 ][4 ]
1 [5 ][6 ][7 ][8 ][9 ]
2 [10][11][12][13][14]
cell_id_1d[i] = x_cell_id[i] + y_cell_id[i]*n_x_cells
I converted the pair of cell indices (x_cell_id[i], y_cell_id[i]) for the ith point to a single index called cell_id_1d.
How to find if trajectory exit a cell at the ith point
Now, the ith and (i + 1)th points are in same cell, if and only if (x_cell_id[i], y_cell_id[i]) == (x_cell_id[i + 1], y_cell_id[i + 1]) and also cell_id_1d[i] == cell_id[i + 1], and cell_id[i + 1] - cell_id[i] == 0. delta_cell_ids[i] = cell_id_1d[i + 1] - cell_id[i], which is zero if and only the ith and (i + 1)th points are in the same cell.
I'm writing a basic Hough Transform in Python - I believe I have got it conceptually correct however, my result is appearing to be offset such that it is split top and bottom, rather than continuous. What I want to get should look like this:
But I get this:
Which is close, but seems to be split badly through the middle! I'm convinced this is due to my indexing of the rho/theta arrays, however despite my many alterations, I cannot get this to fix! Any explanation of my erroneous step and what I need to change would be very gratefully appreciated!
My source code should be complete and run directly...
Thanks very much
David
Source
import numpy as np
import matplotlib.pyplot as mpl
cols, rows = [256,256] # Set size of image
grey_levels = 256 #Grey levels in image
testPixels = [[0 for x in range(rows)] for y in range(cols)] # Convert to black and white
testPixels[100][100] = 255 #Set 3 pixels to white
testPixels[200][200] = 255
testPixels[150][150] = 255
rho_size = int(np.sqrt(rows**2 + cols**2)) #Max possible rho is diagonal dist.
angle_size = 360 #Test all angles
houghspace = [[0 for x in range(rho_size)] for y in range(angle_size)] # Create hough space array
for x in range(rows): # For each rows
for y in range(cols): # For each cols
if testPixels[x][y] == 0: #Skip if not edge point
continue
for theta in range(angle_size):
rho = int(x*np.cos(np.deg2rad(theta)) + y*np.sin(np.deg2rad(theta)))
houghspace[theta][rho] = 255
houghspace = [list(a) for a in zip(*houghspace)] #Transpose to get angle on x axis
fig = mpl.figure() # Create a figure
fig.add_subplot(1, 2, 1).set_title("Original")
mpl.imshow(np.uint8(np.dstack((testPixels,testPixels,testPixels))),cmap='Greys')
fig.add_subplot(1, 2, 2).set_title("Hough Transform")
mpl.imshow(np.uint8(np.dstack((houghspace, houghspace, houghspace))),cmap='Greys')
mpl.show()
You've mixed up the indices when you create houghspace as a list of lists. Please prefer using numpy arrays as it will make things much clearer with indices. Along the x-axis, the angle theta changes and along the y-axis the rho changes. But, you've got it the other way when defining houghspace using list comprehension.
Below is the correct code. Note the comments starting with ##
rho_size = int(np.sqrt(rows**2 + cols**2)) #Max possible rho is diagonal dist.
angle_size = 360 #Test all angles
##houghspace = [[0 for y in range(angle_size) for x in range(2*rho_size)]] #buggy
houghspace = [[0 for x in range(angle_size)] for y in range(rho_size*2)] #correct
## Also double the rho_size, so that both crust and trough of sinusoidal is visible
for x in range(rows): # For each rows
for y in range(cols): # For each cols
if testPixels[x][y] == 0: #Skip if not edge point
continue
for theta in range(angle_size):
rho = int(x*np.cos(np.deg2rad(theta)) + y*np.sin(np.deg2rad(theta))) \
+ rho_size ## also add rho_size
##houghspace[theta][rho] = 255 ## buggy
houghspace[rho][theta] += 255 # <==== indices switched & it's +=
##houghspace = [list(a) for a in zip(*houghspace)]
##Transposing not needed now (we switched indices)
fig = mpl.figure() # Create a figure
fig.add_subplot(1, 2, 1).set_title("Original")
mpl.imshow(np.uint8(np.dstack((testPixels,testPixels,testPixels))),cmap='Greys')
fig.add_subplot(1, 2, 2).set_title("Hough Transform")
mpl.imshow(np.uint8(np.dstack((houghspace, houghspace, houghspace))),cmap='Greys')
mpl.show()
I get the following plot:
I have a list of x and y values for two curves, both having weird shapes, and I don't have a function for any of them. I need to do two things:
Plot it and shade the area between the curves like the image below.
Find the total area of this shaded region between the curves.
I'm able to plot and shade the area between those curves with fill_between and fill_betweenx in matplotlib, but I have no idea on how to calculate the exact area between them, specially because I don't have a function for any of those curves.
Any ideas?
I looked everywhere and can't find a simple solution for this. I'm quite desperate, so any help is much appreciated.
Thank you very much!
EDIT: For future reference (in case anyone runs into the same problem), here is how I've solved this: connected the first and last node/point of each curve together, resulting in a big weird-shaped polygon, then used shapely to calculate the polygon's area automatically, which is the exact area between the curves, no matter which way they go or how nonlinear they are. Works like a charm! :)
Here is my code:
from shapely.geometry import Polygon
x_y_curve1 = [(0.121,0.232),(2.898,4.554),(7.865,9.987)] #these are your points for curve 1 (I just put some random numbers)
x_y_curve2 = [(1.221,1.232),(3.898,5.554),(8.865,7.987)] #these are your points for curve 2 (I just put some random numbers)
polygon_points = [] #creates a empty list where we will append the points to create the polygon
for xyvalue in x_y_curve1:
polygon_points.append([xyvalue[0],xyvalue[1]]) #append all xy points for curve 1
for xyvalue in x_y_curve2[::-1]:
polygon_points.append([xyvalue[0],xyvalue[1]]) #append all xy points for curve 2 in the reverse order (from last point to first point)
for xyvalue in x_y_curve1[0:1]:
polygon_points.append([xyvalue[0],xyvalue[1]]) #append the first point in curve 1 again, to it "closes" the polygon
polygon = Polygon(polygon_points)
area = polygon.area
print(area)
EDIT 2: Thank you for the answers. Like Kyle explained, this only works for positive values. If your curves go below 0 (which is not my case, as showed in the example chart), then you would have to work with absolute numbers.
The area calculation is straightforward in blocks where the two curves don't intersect: thats the trapezium as has been pointed out above. If they intersect, then you create two triangles between x[i] and x[i+1], and you should add the area of the two. If you want to do it directly, you should handle the two cases separately. Here's a basic working example to solve your problem. First, I will start with some fake data:
#!/usr/bin/python
import numpy as np
# let us generate fake test data
x = np.arange(10)
y1 = np.random.rand(10) * 20
y2 = np.random.rand(10) * 20
Now, the main code. Based on your plot, looks like you have y1 and y2 defined at the same X points. Then we define,
z = y1-y2
dx = x[1:] - x[:-1]
cross_test = np.sign(z[:-1] * z[1:])
cross_test will be negative whenever the two graphs cross. At these points, we want to calculate the x coordinate of the crossover. For simplicity, I will calculate x coordinates of the intersection of all segments of y. For places where the two curves don't intersect, they will be useless values, and we won't use them anywhere. This just keeps the code easier to understand.
Suppose you have z1 and z2 at x1 and x2, then we are solving for x0 such that z = 0:
# (z2 - z1)/(x2 - x1) = (z0 - z1) / (x0 - x1) = -z1/(x0 - x1)
# x0 = x1 - (x2 - x1) / (z2 - z1) * z1
x_intersect = x[:-1] - dx / (z[1:] - z[:-1]) * z[:-1]
dx_intersect = - dx / (z[1:] - z[:-1]) * z[:-1]
Where the curves don't intersect, area is simply given by:
areas_pos = abs(z[:-1] + z[1:]) * 0.5 * dx # signs of both z are same
Where they intersect, we add areas of both triangles:
areas_neg = 0.5 * dx_intersect * abs(z[:-1]) + 0.5 * (dx - dx_intersect) * abs(z[1:])
Now, the area in each block x[i] to x[i+1] is to be selected, for which I use np.where:
areas = np.where(cross_test < 0, areas_neg, areas_pos)
total_area = np.sum(areas)
That is your desired answer. As has been pointed out above, this will get more complicated if the both the y graphs were defined at different x points. If you want to test this, you can simply plot it (in my test case, y range will be -20 to 20)
negatives = np.where(cross_test < 0)
positives = np.where(cross_test >= 0)
plot(x, y1)
plot(x, y2)
plot(x, z)
plt.vlines(x_intersect[negatives], -20, 20)
Define your two curves as functions f and g that are linear by segment, e.g. between x1 and x2, f(x) = f(x1) + ((x-x1)/(x2-x1))*(f(x2)-f(x1)).
Define h(x)=abs(g(x)-f(x)). Then use scipy.integrate.quad to integrate h.
That way you don't need to bother about the intersections. It will do the "trapeze summing" suggested by ch41rmn automatically.
Your set of data is quite "nice" in the sense that the two sets of data share the same set of x-coordinates. You can therefore calculate the area using a series of trapezoids.
e.g. define the two functions as f(x) and g(x), then, between any two consecutive points in x, you have four points of data:
(x1, f(x1))-->(x2, f(x2))
(x1, g(x1))-->(x2, g(x2))
Then, the area of the trapezoid is
A(x1-->x2) = ( f(x1)-g(x1) + f(x2)-g(x2) ) * (x2-x1)/2 (1)
A complication arises that equation (1) only works for simply-connected regions, i.e. there must not be a cross-over within this region:
|\ |\/|
|_| vs |/\|
The area of the two sides of the intersection must be evaluated separately. You will need to go through your data to find all points of intersections, then insert their coordinates into your list of coordinates. The correct order of x must be maintained. Then, you can loop through your list of simply connected regions and obtain a sum of the area of trapezoids.
EDIT:
For curiosity's sake, if the x-coordinates for the two lists are different, you can instead construct triangles. e.g.
.____.
| / \
| / \
| / \
|/ \
._________.
Overlap between triangles must be avoided, so you will again need to find points of intersections and insert them into your ordered list. The lengths of each side of the triangle can be calculated using Pythagoras' formula, and the area of the triangles can be calculated using Heron's formula.
The area_between_two_curves function in pypi library similaritymeasures (released in 2018) might give you what you need. I tried a trivial example on my side, comparing the area between a function and a constant value and got pretty close tie-back to Excel (within 2%). Not sure why it doesn't give me 100% tie-back, maybe I am doing something wrong. Worth considering though.
I had the same problem.The answer below is based on an attempt by the question author. However, shapely will not directly give the area of the polygon in purple. You need to edit the code to break it up into its component polygons and then get the area of each. After-which you simply add them up.
Area Between two lines
Consider the lines below:
Sample Two lines
If you run the code below you will get zero for area because it takes the clockwise and subtracts the anti clockwise area:
from shapely.geometry import Polygon
x_y_curve1 = [(1,1),(2,1),(3,3),(4,3)] #these are your points for curve 1
x_y_curve2 = [(1,3),(2,3),(3,1),(4,1)] #these are your points for curve 2
polygon_points = [] #creates a empty list where we will append the points to create the polygon
for xyvalue in x_y_curve1:
polygon_points.append([xyvalue[0],xyvalue[1]]) #append all xy points for curve 1
for xyvalue in x_y_curve2[::-1]:
polygon_points.append([xyvalue[0],xyvalue[1]]) #append all xy points for curve 2 in the reverse order (from last point to first point)
for xyvalue in x_y_curve1[0:1]:
polygon_points.append([xyvalue[0],xyvalue[1]]) #append the first point in curve 1 again, to it "closes" the polygon
polygon = Polygon(polygon_points)
area = polygon.area
print(area)
The solution is therefore to split the polygon into smaller pieces based on where the lines intersect. Then use a for loop to add these up:
from shapely.geometry import Polygon
x_y_curve1 = [(1,1),(2,1),(3,3),(4,3)] #these are your points for curve 1
x_y_curve2 = [(1,3),(2,3),(3,1),(4,1)] #these are your points for curve 2
polygon_points = [] #creates a empty list where we will append the points to create the polygon
for xyvalue in x_y_curve1:
polygon_points.append([xyvalue[0],xyvalue[1]]) #append all xy points for curve 1
for xyvalue in x_y_curve2[::-1]:
polygon_points.append([xyvalue[0],xyvalue[1]]) #append all xy points for curve 2 in the reverse order (from last point to first point)
for xyvalue in x_y_curve1[0:1]:
polygon_points.append([xyvalue[0],xyvalue[1]]) #append the first point in curve 1 again, to it "closes" the polygon
polygon = Polygon(polygon_points)
area = polygon.area
x,y = polygon.exterior.xy
# original data
ls = LineString(np.c_[x, y])
# closed, non-simple
lr = LineString(ls.coords[:] + ls.coords[0:1])
lr.is_simple # False
mls = unary_union(lr)
mls.geom_type # MultiLineString'
Area_cal =[]
for polygon in polygonize(mls):
Area_cal.append(polygon.area)
Area_poly = (np.asarray(Area_cal).sum())
print(Area_poly)
A straightforward application of the area of a general polygon (see Shoelace formula) makes for a super-simple and fast, vectorized calculation:
def area(p):
# for p: 2D vertices of a polygon:
# area = 1/2 abs(sum(p0 ^ p1 + p1 ^ p2 + ... + pn-1 ^ p0))
# where ^ is the cross product
return np.abs(np.cross(p, np.roll(p, 1, axis=0)).sum()) / 2
Application to area between two curves. In this example, we don't even have matching x coordinates!
np.random.seed(0)
n0 = 10
n1 = 15
xy0 = np.c_[np.linspace(0, 10, n0), np.random.uniform(0, 10, n0)]
xy1 = np.c_[np.linspace(0, 10, n1), np.random.uniform(0, 10, n1)]
p = np.r_[xy0, xy1[::-1]]
>>> area(p)
4.9786...
Plot:
plt.plot(*xy0.T, 'b-')
plt.plot(*xy1.T, 'r-')
p = np.r_[xy0, xy1[::-1]]
plt.fill(*p.T, alpha=.2)
Speed
For both curves having 1 million points:
n = 1_000_000
xy0 = np.c_[np.linspace(0, 10, n), np.random.uniform(0, 10, n)]
xy1 = np.c_[np.linspace(0, 10, n), np.random.uniform(0, 10, n)]
%timeit area(np.r_[xy0, xy1[::-1]])
# 42.9 ms ± 140 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
Simple viz of polygon area calculation
# say:
p = np.array([[0, 3], [1, 0], [3, 3], [1, 3], [1, 2]])
p_closed = np.r_[p, p[:1]]
fig, axes = plt.subplots(ncols=2, figsize=(10, 5), subplot_kw=dict(box_aspect=1), sharex=True)
ax = axes[0]
ax.set_aspect('equal')
ax.plot(*p_closed.T, '.-')
ax.fill(*p_closed.T, alpha=0.6)
center = p.mean(0)
txtkwargs = dict(ha='center', va='center')
ax.text(*center, f'{area(p):.2f}', **txtkwargs)
ax = axes[1]
ax.set_aspect('equal')
for a, b in zip(p_closed, p_closed[1:]):
ar = 1/2 * np.cross(a, b)
pos = ar >= 0
tri = np.c_[(0,0), a, b, (0,0)].T
# shrink a bit to make individual triangles easier to visually identify
center = tri.mean(0)
tri = (tri - center)*0.95 + center
c = 'b' if pos else 'r'
ax.plot(*tri.T, 'k')
ax.fill(*tri.T, c, alpha=0.2, zorder=2 - pos)
t = ax.text(*center, f'{ar:.1f}', color=c, fontsize=8, **txtkwargs)
t.set_bbox(dict(facecolor='white', alpha=0.8, edgecolor='none'))
plt.tight_layout()