Is there a better way than Pandas apply? - python

I'm trying to find the distance from each point to the nearest shoreline.
I have two datasets: the latitude/longitude of each point, and the shoreline geometry.
ex) sample_Data (point data; 위도 = latitude, 경도 = longitude) =
        위도        경도
0 36.648365 127.486831
1 36.648365 127.486831
2 37.569615 126.819528
3 37.569615 126.819528
....
gdf =
0 LINESTRING (127.45000 34.45696, 127.44999 34.4...
1 LINESTRING (127.49172 34.87526, 127.49173 34.8...
2 LINESTRING (129.06340 37.61434, 129.06326 37.6...
...
def min_distance(x, y):
    sreach_point = Point(x, y)
    a = gdf.swifter.progress_bar(enable=True).apply(
        lambda row: geod.geometry_length(LineString(nearest_points(row['geometry'], sreach_point))),
        axis=1)
    return a.min()

sample_Data['거리'] = sample_Data.apply(lambda x: min_distance(x['경도'], x['위도']), axis=1, result_type='expand')
This code takes much longer than I expected (about 6 hours to finish), so I'm looking for a better way.
If I cross join both data frames, will the speed increase?

You can use Shapely for the distance, for example:
import numpy as np
import pandas as pd
from shapely.geometry import Point, LineString
from shapely.ops import nearest_points
# geod is assumed to be a pyproj.Geod, as in the question
from pyproj import Geod
geod = Geod(ellps="WGS84")

def min_distance(row, gdf):
    sreach_point = Point(row['경도'], row['위도'])
    a = gdf.swifter.progress_bar(enable=True).apply(
        lambda x: geod.geometry_length(LineString(nearest_points(x['geometry'], sreach_point))),
        axis=1)
    return a.min()

sample_Data['거리'] = sample_Data.apply(min_distance, axis=1, args=(gdf,))
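The per-row apply is still the bottleneck, though. If an approximate answer is acceptable (distance to the nearest shoreline vertex rather than to the line itself), a BallTree with the haversine metric answers every point in one vectorized query, similar to the BallTree answer in the Shapely/Sklearn question further down. A minimal sketch, assuming gdf holds simple LineStrings and sample_Data has the 위도/경도 columns from the question:
import numpy as np
from sklearn.neighbors import BallTree

EARTH_RADIUS_M = 6_371_000  # metres

# every vertex of every shoreline LineString, as (lat, lon)
coast_vertices = np.concatenate([np.array(geom.coords)[:, ::-1] for geom in gdf.geometry])

# build the tree once, in radians, with the haversine metric
tree = BallTree(np.radians(coast_vertices), metric='haversine')

# query all sample points at once
points = np.radians(sample_Data[['위도', '경도']].to_numpy())
dist_rad, _ = tree.query(points, k=1)

# great-circle distance in metres to the nearest coastline vertex
sample_Data['거리'] = dist_rad[:, 0] * EARTH_RADIUS_M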

Related

Convert DataFrame to a multi polygon DataFrame, multiple data point - python

I have a DataFrame as below. I want to convert the data to a multi-polygon DataFrame, because I want to plot each polygon on a map.
I know how to do the conversion with two data points, but with six data points I don't know how. Can anyone help me, please?
geometry = [Point(xy) for xy in zip(neightrip_counts_.lan0, neightrip_counts_.long0)]
geometry
#neightrip_counts_.lan1, neightrip_counts_.long1,neightrip_counts_.lan2, neightrip_counts_.long2
lan0 long0 lan1 long1 lan2 long2
0 59.915667 10.777567 59.916738 10.779916 59.914943 10.773977
1 59.929853 10.711515 59.929435 10.713682 59.927596 10.710033
2 59.939230 10.759170 59.937205 10.760581 59.943750 10.760306
3 59.912520 10.762240 59.911594 10.761774 59.912347 10.763815
4 59.929634 10.732839 59.927140 10.730981 59.931081 10.736003
Let me rename the dataframe neightrip_counts_ as df for brevity. Here is the relevant code that creates a polygon for each row of the dataframe.
df['geometry'] = [Polygon([(z[0], z[1]), (z[2], z[3]), (z[4], z[5])])
                  for z in zip(df.long0, df.lan0, df.long1, df.lan1, df.long2, df.lan2)]
gpdf = gpd.GeoDataFrame(df, geometry="geometry")
gpdf.plot()
By the way, you must be careful about the order of (long, lat): folium expects (lat, long).
start_coords = [ gdf.centroid[0].x, gdf.centroid[0].y]  # is wrong
Use this instead.
start_coords = [ gdf.centroid[0].y, gdf.centroid[0].x]
Edit
For the benefit of the readers, here is the complete runnable code:
import pandas as pd
import geopandas as gpd
from io import StringIO
from shapely.geometry import Polygon, Point, LineString
import numpy as np
import folium
data1 = """index lan0 long0 lan1 long1 lan2 long2
0 59.915667 10.777567 59.916738 10.779916 59.914943 10.773977
1 59.929853 10.711515 59.929435 10.713682 59.927596 10.710033
2 59.939230 10.759170 59.937205 10.760581 59.943750 10.760306
3 59.912520 10.762240 59.911594 10.761774 59.912347 10.763815
4 59.929634 10.732839 59.927140 10.730981 59.931081 10.736003"""
# read/parse data into dataframe
df0 = pd.read_csv(StringIO(data1), sep=r'\s+', index_col='index')
# create `geometry` column
df0['geometry'] = [Polygon([(xy[0], xy[1]), (xy[2], xy[3]), (xy[4], xy[5])])
                   for xy in zip(df0.long0, df0.lan0, df0.long1, df0.lan1, df0.long2, df0.lan2)]
# build the GeoDataFrame (a plain pandas DataFrame has no set_geometry method)
gpdf = gpd.GeoDataFrame(df0, geometry="geometry")
# do check plot. (uncomment next line)
#gpdf.plot()
# make geojson
center_pt = gpdf.centroid[0].y, gpdf.centroid[0].x
gdf_json = gpdf.to_json()
# plot the geojson on the folium webmap
webmap = folium.Map(location = center_pt, zoom_start = 13, min_zoom = 3)
folium.GeoJson(gdf_json, name='data_layer_1').add_to(webmap)
# this opens the webmap
webmap
Output screen capture (of interactive webmap):
Try this, assuming 'lan' is latitude.
import geopandas as gpd
from shapely.geometry import Polygon
import numpy as np
import pandas as pd
import folium
# ....
def addpolygeom(row):
    row_array = np.array(row)
    # split the row into (lat, lon) pairs, then reverse each pair to (lon, lat) for shapely
    coords = [tuple(i)[::-1] for i in np.split(row_array, range(2, row_array.shape[0], 2))]
    polygon = Polygon(coords)
    return polygon
# Convert points to shapely geometry
neightrip_counts_['geometry'] = neightrip_counts_.apply(lambda x: addpolygeom(x), axis=1)
# Create a GeoDataFrame
gdf = gpd.GeoDataFrame(neightrip_counts_, geometry='geometry')
start_coords = [ gdf.centroid[0].y, gdf.centroid[0].x]
gdf_json = gdf.to_json()
map = folium.Map(start_coords, zoom_start=4)
folium.GeoJson(gdf_json, name='mypolygons').add_to(map)
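In a Jupyter notebook the map object renders inline when you evaluate it; outside a notebook it can be written to an HTML file instead (a small usage note, file name chosen arbitrarily):
map.save('mypolygons.html')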

How to optimize Shapely and Sklearn code?

I am working with a dataset of 4.2 million points, and my code already takes a while to process. The code below, however, takes several hours (it was provided in another public question; it basically finds the nearest LineString to a point, takes the nearest point on that LineString, and calculates the distance).
The code actually does an awesome job, but it takes too long for its purpose. How can I optimize it, or do the same thing in less time?
import geopandas as gpd
import numpy as np
from shapely.geometry import Point, LineString
from shapely.ops import nearest_points
from sklearn.neighbors import DistanceMetric
EARTH_RADIUS_IN_MILES = 3440.1 #NAUTICAL MILES
panama = gpd.read_file("/Users/Danilo/Documents/Python/panama_coastline/panama_coastline.shp")
# b: number of points in `data`; dtc: list collecting the distances (defined elsewhere)
for c in range(b):
    #p = Point(-77.65325423107359,9.222038196656131)
    p = Point(data['longitude'][c], data['latitude'][c])

    def closest_line(point, linestrings):
        return np.argmin([p.distance(linestring) for linestring in panama.geometry])

    closest_linestring = panama.geometry[closest_line(p, panama.geometry)]
    closest_point = nearest_points(p, closest_linestring)

    dist = DistanceMetric.get_metric('haversine')
    points_as_floats = [np.array([pt.x, pt.y]) for pt in closest_point]
    haversine_distances = dist.pairwise(np.radians(points_as_floats), np.radians(points_as_floats))
    haversine_distances *= EARTH_RADIUS_IN_MILES
    dtc1 = haversine_distances[0][1]
    dtc.append(dtc1)
Edit: Simplify to single calculation with BallTree
Imports
import pandas as pd
import geopandas as gpd
import numpy as np
from shapely.geometry import Point, LineString
from shapely.ops import nearest_points
Read Panama
panama = gpd.read_file("panama_coastline/panama_coastline.shp")
Get all points, long,lat format:
def get_points_as_numpy(geom):
    work_list = []
    for g in geom:
        work_list.append(np.array(g.coords))
    return np.concatenate(work_list)
all_coastline_points = get_points_as_numpy(panama.geometry)
Create Balltree
from sklearn.neighbors import BallTree
import numpy as np
panama_radians = np.radians(np.flip(all_coastline_points,axis=1))
tree = BallTree(panama_radians, leaf_size=12, metric='haversine')
Create 1M random points:
mean = [8.5,-80]
cov = [[1,0],[0,5]] # diagonal covariance, points lie on x or y-axis
random_gps = np.random.multivariate_normal(mean,cov,(10**6))
random_points = pd.DataFrame( {'lat' : random_gps[:,0], 'long' : random_gps[:,1]})
random_points.head()
Calculate closest coast point (<30 Seconds on my machine)
distances, index = tree.query( np.radians(random_gps), k=1)
Put results in DataFrame
EARTH_RADIUS_IN_MILES = 3440.1
random_points['distance_to_coast'] = distances * EARTH_RADIUS_IN_MILES
random_points['closest_lat'] = all_coastline_points[index][:,0,1]
random_points['closest_long'] = all_coastline_points[index][:,0,0]
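The same tree also works for one-off lookups; a tiny usage sketch with a made-up coordinate:
# query a single (lat, long) position (coordinate chosen arbitrarily for illustration)
one_point = np.radians([[9.0, -79.5]])
d, i = tree.query(one_point, k=1)
print(d[0][0] * EARTH_RADIUS_IN_MILES, all_coastline_points[i[0][0]])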

Aggregating points using xarray

I have a set of netcdf datasets that basically look like a CSV file with columns for latitude, longitude, value. These are points along tracks that I want to aggregate to a regular grid of (say) 1 degree from -90 to 90 and -180 to 180 degrees, by for example calculating the mean and/or standard deviation of all points that fall within a given cell.
This is quite easily done with a loop
D = np.zeros((180, 360))
for ilat in np.arange(-90, 90, 1, dtype=int):
    for ilon in np.arange(-180, 180, 1, dtype=int):
        p1 = np.logical_and(ds.lat >= ilat,
                            ds.lat <= ilat + 1)
        p2 = np.logical_and(ds.lon >= ilon,
                            ds.lon <= ilon + 1)
        if np.sum(p1 * p2) == 0:
            D[90 + ilat, 180 + ilon] = np.nan
        else:
            D[90 + ilat, 180 + ilon] = np.mean(ds.var.values[p1 * p2])
            # D[90 + ilat, 180 + ilon] = np.std(ds.var.values[p1*p2])
Other than using numba/cython to speed this up, I was wondering whether this is something you can directly do with xarray in a more efficient way?
You should be able to solve this using pandas and xarray.
You will first need to convert your data set to a pandas data frame.
Once this is done, df is the dataframe and, assuming longitude and latitude are in lon/lat columns, you will need to round the lon/lats to the nearest integer value and then calculate the mean for each rounded lat/lon. The groupby leaves lat/lon as the index, so you can then use xarray's to_xarray to convert the result to an array:
import xarray as xr
import pandas as pd
import numpy as np
df = df.assign(lon = lambda x: np.round(x.lon))
df = df.assign(lat = lambda x: np.round(x.lat))
# groupby leaves (lat, lon) as the index, which to_xarray uses as dimensions
df = df.groupby(["lat", "lon"]).mean()
df.to_xarray()
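A quick check with a few synthetic points (values made up for illustration) shows the shape of the result:
df = pd.DataFrame({"lat": [10.2, 10.4, -5.1],
                   "lon": [100.1, 100.3, 30.9],
                   "value": [1.0, 3.0, 5.0]})
df = df.assign(lon=lambda x: np.round(x.lon), lat=lambda x: np.round(x.lat))
print(df.groupby(["lat", "lon"]).mean().to_xarray())
# the two points that round to (10.0, 100.0) are averaged to 2.0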
I used @robert-wilson's answer as a starting point, and to_xarray is indeed part of my solution. Other inspiration came from here. The approach that I used is shown below. It's probably slower than numba-ing my solution above, but much simpler.
import netCDF4
import numpy as np
import xarray as xr
import pandas as pd
fname = "super_funky_file.nc"
f = netCDF4.Dataset(fname)
lat = f.variables['lat'][:]
lon = f.variables['lon'][:]
vari = f.variables['super_duper_variable'][:]
df = pd.DataFrame({"lat":lat,
"lon":lon,
"vari":vari})
# Simple functions to calculate the grid location in rows/cols
# using lat/lon as inputs. Global 0.5 deg grid
# Remember to cast to integer
to_col = lambda x: np.floor(
(x+90)/0.5).astype(
np.int)
to_row = lambda x: np.floor(
(x+180.)/0.5).astype(
np.int)
# Map the latitudes to columns
# Map the longitudes to rows
df['col'] = df.lat.map(to_col)
df['row'] = df.lon.map(to_row)
# Aggregate by row and col
gg = df.groupby(['col', 'row'])
# Now, create an xarray dataset with
# the mean of vari per grid cell
ds = gg.mean().to_xarray()
dx = gg.std().to_xarray()
ds['stdi'] = dx['vari']
dx = gg.count().to_xarray()
ds['counti'] = dx['vari']
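If actual coordinates are preferred over the integer col/row indices, the cell centres can be recovered from the same 0.5-degree grid definition (a sketch under the same assumptions as the code above):
# convert the integer grid indices back to cell-centre latitude/longitude
ds = ds.assign_coords(col=ds.col.values * 0.5 - 90 + 0.25,    # latitude of cell centre
                      row=ds.row.values * 0.5 - 180 + 0.25)   # longitude of cell centre
ds = ds.rename({"col": "lat", "row": "lon"})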

How to use geopanda or shapely to find nearest point in same geodataframe

I have a geodataframe showing ~25 locations represented as point geometry. I am trying to come up with a script that goes through each point, identifies the nearest location and returns the name of the nearest location and the distance.
I can easily do this if I have different geodataframes, using nearest_points(geom1, geom2) from the shapely.ops library. However, all my locations are stored in one geodataframe. I am trying to loop through it, and that is where I am having trouble.
Here is my sample file:
geofile = gpd.GeoDataFrame([[0, 'location A', Point(55, 55)],
                            [1, 'location B', Point(66, 66)],
                            [2, 'Location C', Point(99, 99)],
                            [3, 'Location D', Point(11, 11)]],
                           columns=['ID', 'Location', 'geometry'])
Here is the loop I am creating to no avail.
for index, row in geofile.iterrows():
    nearest_geoms = nearest_points(row, geofile)
    print('location:' + nearest_geoms[0])
    print('nearest:' + nearest_geoms[1])
    print('-------')
I am getting this error:
AttributeError: 'Series' object has no attribute '_geom'
However, I think my problem goes beyond the error, because I also have to exclude the row I am looping over; otherwise it will always be returned as the closest location, since it is that location itself.
My end result for one location would be the following:
([0, 'location A', 'location B', '5 miles', Point(55,55)], columns=['ID', 'Location', 'Nearest', 'Distance', 'geometry'])
Shapely's nearest_points function compares shapely geometries. To compare a single Point geometry against multiple other Point geometries, you can use .unary_union to compare against the resulting MultiPoint geometry. And yes, at each row operation, drop the respective point so it is not compared against itself.
import geopandas as gpd
from shapely.geometry import Point
from shapely.ops import nearest_points
df = gpd.GeoDataFrame([[0, 'location A', Point(55, 55)],
                       [1, 'location B', Point(66, 66)],
                       [2, 'Location C', Point(99, 99)],
                       [3, 'Location D', Point(11, 11)]],
                      columns=['ID', 'Location', 'geometry'])
df.insert(3, 'nearest_geometry', None)

for index, row in df.iterrows():
    point = row.geometry
    multipoint = df.drop(index, axis=0).geometry.unary_union
    queried_geom, nearest_geom = nearest_points(point, multipoint)
    df.loc[index, 'nearest_geometry'] = nearest_geom
Resulting in
ID Location geometry nearest_geometry
0 0 location A POINT (55 55) POINT (66 66)
1 1 location B POINT (66 66) POINT (55 55)
2 2 Location C POINT (99 99) POINT (66 66)
3 3 Location D POINT (11 11) POINT (55 55)
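The question also asks for the nearest location's name and the distance. Both can be collected in the same kind of loop; a minimal sketch (the distance is plain Euclidean distance in the data's coordinate units, so getting miles would require a projected CRS or a haversine calculation):
for index, row in df.iterrows():
    point = row.geometry
    others = df.drop(index, axis=0)
    # distance from this point to every other point
    dists = others.geometry.distance(point)
    nearest_idx = dists.idxmin()
    df.loc[index, 'Nearest'] = others.loc[nearest_idx, 'Location']
    df.loc[index, 'Distance'] = dists.min()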
The following approach using sklearn.neighbors.NearestNeighbors accomplishes this task with two lines of code and scales rather well (both in terms of the number of points and the number of neighbors):
import numpy as np
import pandas as pd
import geopandas as gpd
from sklearn.neighbors import NearestNeighbors
N_POINTS = 10_000
N_NEIGHBORS = 10
# generate larger dataframe with random points:
np.random.seed(23)
acoords = np.random.randint(0, 1000, (N_POINTS, 2))
df = gpd.GeoDataFrame({"ID": range(N_POINTS)}, geometry=gpd.points_from_xy(acoords[:, 0], acoords[:, 1]))
# 2d numpy array of the coordinates
coords = np.array(df.geometry.map(lambda p: [p.x, p.y]).tolist())
# "train"/initialize the NearestNeighbors model
# NOTE: N_NEIGHBORS + 1 since we are dropping the nearest point
# (which is each point itself with distance 0)
knn = NearestNeighbors(n_neighbors=N_NEIGHBORS + 1, algorithm='kd_tree').fit(coords)
# retrieve neighbors (distance and index)
knn_dist, knn_idx = knn.kneighbors(coords)
# add results to dataframe:
df[list(map("NEIGHBOR_{}".format, range(1, N_NEIGHBORS + 1)))] = \
df.geometry.values.to_numpy()[knn_idx[:, 1:]]
print(df)
Result:
ID geometry NEIGHBOR_1 ... NEIGHBOR_8 \
0 0 POINT (595.000 742.000) POINT (597 737) ... POINT (592 756)
1 1 POINT (40.000 969.000) POINT (40 971) ... POINT (27 961)
... ... ... ... ... ...
9998 9998 POINT (38.000 508.000) POINT (34 507) ... POINT (50 516)
9999 9999 POINT (891.000 936.000) POINT (887 931) ... POINT (876 929)
NEIGHBOR_9 NEIGHBOR_10
0 POINT (598 727) POINT (606 730)
1 POINT (31 954) POINT (37 987)
... ... ...
9998 POINT (29 496) POINT (23 511)
9999 POINT (908 930) POINT (901 951)
[10000 rows x 12 columns]
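For geographic lon/lat data, the same pattern can be used with a BallTree and the haversine metric; coordinates just need to be supplied in radians as (lat, lon), and distances come back in radians (a sketch with arbitrary example coordinates, not tied to the data above):
from sklearn.neighbors import BallTree
import numpy as np

EARTH_RADIUS_KM = 6371.0

# (n, 2) array of [latitude, longitude] in degrees
latlon = np.array([[59.9157, 10.7776],
                   [59.9299, 10.7115],
                   [59.9392, 10.7592]])
tree = BallTree(np.radians(latlon), metric='haversine')
dist_rad, idx = tree.query(np.radians(latlon), k=2)   # k=2: the nearest point besides itself
nearest_idx = idx[:, 1]
nearest_km = dist_rad[:, 1] * EARTH_RADIUS_KM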
Older/obsolete answers:
Here is another approach based on scipy.spatial.distance.cdist. The iterrows is avoided by using numpy masked arrays.
import geopandas as gpd
from scipy.spatial import distance
import numpy.ma as ma
from shapely.geometry import Point
import numpy as np
df = gpd.GeoDataFrame([[0, 'location A', Point(55, 55)],
                       [1, 'location B', Point(66, 66)],
                       [2, 'Location C', Point(99, 99)],
                       [3, 'Location D', Point(11, 11)]],
                      columns=['ID', 'Location', 'geometry'])
coords = np.stack(df.geometry.apply(lambda x: [x.x, x.y]))
distance_matrix = ma.masked_where((dist := distance.cdist(*[coords] * 2)) == 0, dist)
df["closest_ID"] = np.argmin(distance_matrix, axis=0)
df = df.join(df.set_index("ID").geometry.rename("nearest_geometry"), on="closest_ID")
df.drop("closest_ID", axis=1)
# Out:
ID Location geometry nearest_geometry
0 0 location A POINT (55.000 55.000) POINT (66.00000 66.00000)
1 1 location B POINT (66.000 66.000) POINT (55.00000 55.00000)
2 2 Location C POINT (99.000 99.000) POINT (66.00000 66.00000)
3 3 Location D POINT (11.000 11.000) POINT (55.00000 55.00000)
Generalization for multiple neighbors
Since the distance_matrix contains the complete information on distances between all pairs of points, it is easy to generalize this approach to arbitrary numbers of neighbors. For example, if we are interested in finding the N_NEAREST = 2 neighbors for each point, we can sort the distance matrix (using np.argsort, instead of picking the np.argmin, as before) and select the corresponding number of columns:
N_NEAREST = 2
nearest_id_cols = list(map("nearest_id_{}".format, range(1, N_NEAREST + 1)))
nearest_geom_cols = list(map("nearest_geometry_{}".format, range(1, N_NEAREST + 1)))
df[nearest_id_cols] = np.argsort(distance_matrix, axis=1)[:, :N_NEAREST]
df[nearest_geom_cols] = df[nearest_id_cols].applymap(
    lambda x: df.set_index("ID").geometry[x])
# out:
ID Location geometry nearest_id_1 nearest_id_2 \
0 0 location A POINT (55.00000 55.00000) 1 2
1 1 location B POINT (66.00000 66.00000) 0 2
2 2 Location C POINT (99.00000 99.00000) 1 0
3 3 Location D POINT (11.00000 11.00000) 0 1
nearest_geometry_1 nearest_geometry_2
0 POINT (66 66) POINT (99 99)
1 POINT (55 55) POINT (99 99)
2 POINT (66 66) POINT (55 55)
3 POINT (55 55) POINT (66 66)

Speed up a curve_fit in Pandas DataFrame

I have a dataframe with the independent variable in the column headers, and each row is a separate set of dependent variables:
5.032530 6.972868 8.888268 10.732009 12.879130 16.877655
0 2.512298 2.132748 1.890665 1.583538 1.582968 1.440091
1 5.628667 4.206962 4.179009 3.162677 3.132448 1.887631
2 3.177090 2.274014 2.412432 2.066641 1.845065 1.574748
3 5.060260 3.793109 3.129861 2.617136 2.703114 1.921615
4 4.153010 3.354411 2.706463 2.570981 2.020634 1.646298
I would like to fit a curve of the form Y = A*x^B to each row. I need to solve for A and B for about ~5000 rows, with 6 data points in each row. I was able to do this using apply, but it takes about 40 seconds. Can I speed it up using Cython or by vectorizing somehow? I need precision to about 4 decimal places.
Here is what I have:
import pandas as pd
from scipy.optimize import curve_fit
import numpy as np
import matplotlib.pyplot as plt
df = pd.read_csv(r'C:\File.csv')
def curvefita(y):
return curve_fit(lambda x,a,b: a*np.power(x,b), df.iloc[:,3:].columns, y,p0=[8.4,-.58], bounds=([0,-10],[200,10]),maxfev=2000)[0][0]
def curvefitb(y):
return curve_fit(lambda x,a,b: a*np.power(x,b), df.iloc[:,3:].columns, y,p0=[8.4,-.58], bounds=([0,-10],[200,10]),maxfev=2000)[0][1]
avalues = df.iloc[:,3:].apply(curvefita, axis=1)
bvalues = df.iloc[:,3:].apply(curvefitb, axis=1)
df['a']=avalues
df['b']=bvalues
colcount = len(df.columns)
#build power fit - make the matrix
powerfit = df.copy()
for column in range(colcount-2):
    powerfit.iloc[:,column] = powerfit.iloc[:,colcount-2] * (powerfit.columns[column]**powerfit.iloc[:,colcount-1])
#graph an example
plt.plot(powerfit.iloc[0,:colcount-2],'r')
plt.plot(df.iloc[0,:colcount-2],'ro')
#another example looked up by ticker
plt.plot(powerfit.iloc[5,:colcount-2],'b')
plt.plot(df.iloc[5,:colcount-2],'bo')
You actually do two curve_fits per row, one for a and one for b. Try to find a way to get both of them from a single fit, so you can halve your execution time:
def func(x, a, b):
    return a * np.power(x, b)

def curvefit(y):
    return tuple(curve_fit(func, df.iloc[:,3:].columns, y, p0=[8.4, -.58], bounds=([0, -10], [200, 10]))[0])

df[["a", "b"]] = df.iloc[:,3:].apply(curvefit, axis=1).apply(pd.Series)
print(df)
# 5.03253 6.972868 8.888268 10.732009 12.87913 16.877655 a \
# 0 2.512298 2.132748 1.890665 1.583538 1.582968 1.440091 2.677070
# 1 5.628667 4.206962 4.179009 3.162677 3.132448 1.887631 39.878792
# 2 3.177090 2.274014 2.412432 2.066641 1.845065 1.574748 8.589886
# 3 5.060260 3.793109 3.129861 2.617136 2.703114 1.921615 13.078827
# 4 4.153010 3.354411 2.706463 2.570981 2.020634 1.646298 27.715207
# b
# 0 -0.215338
# 1 -1.044384
# 2 -0.600827
# 3 -0.656381
# 4 -1.008753
And to make this more reusable, I would make curvefit also take the x-values and function, which can be passed in with functools.partial:
from functools import partial

def curvefit(func, x, y):
    return tuple(curve_fit(func, x, y, p0=[8.4, -.58], bounds=([0, -10], [200, 10]))[0])

fit = partial(curvefit, func, df.iloc[:,3:].columns)
df[["a", "b"]] = df.iloc[:,3:].apply(fit, axis=1).apply(pd.Series)
I was able to bring my runtime down to 550 ms by following the advice of @Brenlla. This code uses an unweighted/biased formula similar to Excel, which is good enough for my purposes (@kennytm discusses it here).
df = pd.read_csv(r'C:\File.csv')
df2=np.log(df)
df3=df2.iloc[:,3:].copy()
df3.columns=np.log(df3.columns)
def curvefit(y):
    return tuple(np.polyfit(df3.columns, y, 1))

df[["b", "a"]] = df3.apply(curvefit, axis=1).apply(pd.Series)
df['a']=np.exp(df['a'])
colcount = len(df.columns)
powerfit = df.copy()
for column in range(colcount-2):
    powerfit.iloc[:,column] = powerfit.iloc[:,colcount-1] * (powerfit.columns[column]**powerfit.iloc[:,colcount-2])
#graph an example
plt.plot(powerfit.iloc[0,:colcount-2],'r')
plt.plot(df.iloc[0,:colcount-2],'ro')
#another example looked up by ticker
plt.plot(powerfit.iloc[5,:colcount-2],'b')
plt.plot(df.iloc[5,:colcount-2],'bo')
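Since np.polyfit accepts a 2-D y array and fits each column independently, the per-row apply can be dropped entirely, which removes most of the remaining overhead. A minimal sketch under the same log-log assumption (df3 as defined above, one original row per column after transposing):
# fit all rows at once: transpose so each original row becomes a column of y
coeffs = np.polyfit(df3.columns.to_numpy(dtype=float), df3.to_numpy().T, 1)
df['b'] = coeffs[0]           # slope in log-log space is the exponent B
df['a'] = np.exp(coeffs[1])   # intercept in log-log space is log(A)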
