Convert DataFrame to a multi polygon DataFrame, multiple data point - python - python

I have a DataFrame as below, I want to convert data to a multi polygon DataFrame, because I want to plot each multi polygon on a map.
I know how to convert if I have two data point, but with 6 data point, I don't know how to convert it. can anyone help me please.
geometry = [Point(xy) for xy in zip(neightrip_counts_.lan0, neightrip_counts_.long0)]
geometry
#neightrip_counts_.lan1, neightrip_counts_.long1,neightrip_counts_.lan2, neightrip_counts_.long2
lan0 long0 lan1 long1 lan2 long2
0 59.915667 10.777567 59.916738 10.779916 59.914943 10.773977
1 59.929853 10.711515 59.929435 10.713682 59.927596 10.710033
2 59.939230 10.759170 59.937205 10.760581 59.943750 10.760306
3 59.912520 10.762240 59.911594 10.761774 59.912347 10.763815
4 59.929634 10.732839 59.927140 10.730981 59.931081 10.736003

Let me rename the dataframe neightrip_counts_ as df for brevity. Here is the relevant code that will create a polygon for each row of dataframe.
df['geometry'] = [Polygon([(z[0],z[1]), (z[2],z[3]), (z[4],z[5])]) for z in zip(df.long0, df.lan0, df.long1, df.lan1, df.long2, df.lan2)]
gpdf = df.set_geometry("geometry", drop=True)
gpdf.plot()
By the way, you must be careful about the sequence of (long, lat).
start_coords = [ gdf.centroid[0].x, gdf.centroid[0].y] # is wrong
Use this in stead.
start_coords = [ gdf.centroid[0].y, gdf.centroid[0].x]
Edit
For the benefits of the readers, here is the complete runnable code:
import pandas as pd
import geopandas as gpd
from io import StringIO
from shapely.geometry import Polygon, Point, LineString
import numpy as np
import folium
data1 = """index lan0 long0 lan1 long1 lan2 long2
0 59.915667 10.777567 59.916738 10.779916 59.914943 10.773977
1 59.929853 10.711515 59.929435 10.713682 59.927596 10.710033
2 59.939230 10.759170 59.937205 10.760581 59.943750 10.760306
3 59.912520 10.762240 59.911594 10.761774 59.912347 10.763815
4 59.929634 10.732839 59.927140 10.730981 59.931081 10.736003"""
# read/parse data into dataframe
df0 = pd.read_csv(StringIO(data1), sep='\s+', index_col='index')
# create `geometry` column
df0['geometry'] = [Polygon([(xy[0],xy[1]), (xy[2],xy[3]), (xy[4],xy[5])]) \
for xy in zip(df0.long0, df0.lan0, df0.long1, df0.lan1, df0.long2, df0.lan2)]
# set geometry
gpdf = df0.set_geometry("geometry", drop=True)
# do check plot. (uncomment next line)
#gpdf.plot()
# make geojson
center_pt = gpdf.centroid[0].y, gpdf.centroid[0].x
gdf_json = gpdf.to_json()
# plot the geojson on the folium webmap
webmap = folium.Map(location = center_pt, zoom_start = 13, min_zoom = 3)
folium.GeoJson(gdf_json, name='data_layer_1').add_to(webmap)
# this opens the webmap
webmap
Output screen capture (of interactive webmap):

Try this, assuming the 'lan' is latitude.
import geopandas as gpd
from shapely.geometry import Polygon
import numpy as np
import pandas as pd
import folium
# ....
def addpolygeom(row):
row_array = np.array(row)
# split dataframe row to a list of tuples (lat, lon)
coords = [tuple(i)[::-1] for i in np.split(row_array, range(2, row_array.shape[0], 2))]
polygon = Polygon(coords)
return polygon
# Convert points to shapely geometry
neightrip_counts_['geometry'] = neightrip_counts_.apply(lambda x: addpolygeom(x), axis=1)
# Create a GeoDataFrame
gdf = gpd.GeoDataFrame(neightrip_counts_, geometry='geometry')
start_coords = [ gdf.centroid[0].y, gdf.centroid[0].x]
gdf_json = gdf.to_json()
map = folium.Map(start_coords, zoom_start=4)
folium.GeoJson(gdf_json, name='mypolygons').add_to(map)

Related

Is there a better way than Pandas apply?

I'm trying to find the distance from each point to the nearest shoreline.
I have two data.
Latitude, longitude information for each point
About the shoreline
ex) sample_Data (Point Data) =
위도 경도
0 36.648365 127.486831
1 36.648365 127.486831
2 37.569615 126.819528
3 37.569615 126.819528
....
gdf =
0 LINESTRING (127.45000 34.45696, 127.44999 34.4...
1 LINESTRING (127.49172 34.87526, 127.49173 34.8...
2 LINESTRING (129.06340 37.61434, 129.06326 37.6...
...
def min_distance(x,y):
sreach_point = Point(x,y)
a = gdf.swifter.progress_bar(enable=True).apply(lambda x : geod.geometry_length(LineString(nearest_points(x['geometry'], sreach_point))),axis = 1)
return a.min()
sample_Data['거리']= sample_Data.apply(lambda x : min_distance(x['경도'],x['위도']),axis =1 ,result_type='expand')
This code takes longer than I thought, so I'm looking for a better way.
If I cross join both data frames, will the speed increase?
It takes about 6 hours to proceed with the above code.
you can use shapely for distance like :
import numpy as np
import pandas as pd
from shapely.geometry import Point, LineString
def min_distance(row, gdf):
sreach_point = Point(row['경도'], row['위도'])
a = gdf.swifter.progress_bar(enable=True).apply(
lambda x: geod.geometry_length(LineString(nearest_points(x['geometry'], sreach_point))), axis=1
)
return a.min()
sample_Data['거리'] = gdf.apply(min_distance, axis=1, args=(sample_Data,))

Reverse Array in a dataframe

Hi I am trying to extract data from a netCDF file, but the data is upside down. How can I reverse the database:
The data I want to extract is the height data from the (netcdf) at the points I have in the CSV file. my Data:
import numpy as np
from netCDF4 import Dataset
import matplotlib.pyplot as plt
import pandas as pd
from mpl_toolkits.basemap import Basemap
from matplotlib.patches import Path, PathPatch
csv_data = np.loadtxt('CSV with target coordinates',skiprows=1,delimiter=',')
num_el = csv_data[:,0]
lat = csv_data[:,1]
lon = csv_data[:,2]
value = csv_data[:,3]
data = Dataset("elevation Data",'r')
lon_range = data.variables['x_range'][:]
lat_range = data.variables['y_range'][:]
topo_range = data.variables['z_range'][:]
spacing = data.variables['spacing'][:]
dimension = data.variables['dimension'][:]
z = data.variables['z'][:]
lon_num = dimension[0]
lat_num = dimension[1]
etopo_lon = np.linspace(lon_range[0],lon_range[1],dimension[0])
etopo_lat = np.linspace(lat_range[0],lat_range[1],dimension[1])
topo = np.reshape(z, (lat_num, lon_num))
height = np.empty_like(num_el)
desired_lat_idx = np.empty_like(num_el)
desired_lon_idx = np.empty_like(num_el)
for i in range(len(num_el)):
tmp_lat = np.abs(etopo_lat - lat[i]).argmin()
tmp_lon = np.abs(etopo_lon - lon[i]).argmin()
desired_lat_idx[i] = tmp_lat
desired_lon_idx[i] = tmp_lon
height[i] = topo[tmp_lat,tmp_lon]
height[height<-10]=0
print(len(desired_lat_idx))
print(len(desired_lon_idx))
print(len(height))
dfl= pd.DataFrame({
'Latitude' : lat.reshape(-1),
'Longitude': lon.reshape(-1),
'Altitude': height.reshape(-1)
});
print(dfl)
# but the Lat should not be changed here (the dfl must be correct)
df =dfl
lat=np.array(df['Latitude'])
lon=np.array(df['Longitude'])
val=np.array(df['Altitude'])
m = basemap.Basemap(projection='robin', lon_0=0, lat_0=0, resolution='l',area_thresh=1000)
m.drawcoastlines(color = 'black')
x,y = m(lon,lat)
colormesh= m.contourf(x,y,val,100, tri=True, cmap = 'terrain')
plt.colorbar(location='bottom',pad=0.04,fraction=0.06)
plt.show()
I have already tried:
lat = csv_data[:,1]
lat= lat*(-1)
But this didn´t work
It's a plotting artifact().
Just do:
colormesh= m.contourf(x,y[::-1],val,100, tri=True, cmap = 'terrain')
y[::-1] will reverse the order of the y latitude elements (as opposed to the land-mass outlines; and while keeping the x longitude coordinates the same) and hence flip them.
I've often had this problem with plotting numpy image data in the past.
Your raw CSV data are unlikely to be flipped themselves (why would they be?). You should try sanity-checking them [I am not a domain expert I'm afraid]! Overlaying an actual coordinate grid can help with this.
Another way to do it is given here: Reverse Y-Axis in PyPlot
You could also therefore just do
ax = plt.gca()
ax.invert_yaxis()

Python: k-means clustering

I’m trying to use k-means clustering for the data of longitudes and latitudes of a .csv file but instead of plotting a graph I only want to get and print the centroids so that i can search them on google maps. Does anyone know how to code that?
import pandas as pd
import numpy as np
import csv
with open('fileName.csv', 'r') as infile:
csv_reader = csv.reader (infile, delimiter=',')
x = []
y = []
for row in csv_reader:
if row[3] != 'LONGITUDE':
x.append(float(row[3]))
y.append(float(row[4]))
df = pd.DataFrame({
'x': x,
'y': y
})
#implement x and y in k-means and print the centroids
I would suggest looking at the Shapely library.
from shapely.geometry import MultiPoint
import pandas as pd
import numpy as np
import csv
# Easy way to read your csv file in
df = pd.read_csv('fileName.csv').rename(columns={'LONGITUDE': 'x', 'LATITUDE': 'y'})
# Assumes you have a column 'cluster_id' that references the cluster id for each coordinate
cluster_ids = df['cluster_id'].unique()
kmeans_clusters = []
for cluster_id in cluster_ids:
# Filtered df for each cluster id
cluster_df = df.loc[df['cluster_id'] == cluster_id]
x_values = cluster_df['x'].tolist()
y_values = cluster_df['y'].tolist()
xy_pairs = [point for point in zip(x_values, y_values)]
kmeans_clusters.append(xy_pairs)
# Where kmeans_clusters is a list of your clusters, each containing a list of xy pairs
centroids = []
for cluster in kmeans_clusters:
if len(cluster) > 1:
# Create a convex hull, find the centroid
convex_hull = MultiPoint(cluster).convex_hull
centroid = convex_hull.centroid
# Unpack to tuple object
centroids.append(list(centroid.coords)[0])
else:
# Single point cluster, it is the centroid
centroids.append(cluster[0])
print(centroids)

Aggregating points using xarray

I have a set of netcdf datasets that basically look like a CSV file with columns for latitude, longitude, value. These are points along tracks that I want to aggregate to a regular grid of (say) 1 degree from -90 to 90 and -180 to 180 degrees, by for example calculating the mean and/or standard deviation of all points that fall within a given cell.
This is quite easily done with a loop
D = np.zeros((180, 360))
for ilat in np.arange(-90, 90, 1, dtype=np.int):
for ilon in np.arange(-180, 180, 1, dtype=np.int):
p1 = np.logical_and(ds.lat >= ilat,
ds.lat <= ilat + 1)
p2 = np.logical_and(ds.lon >=ilon,
ds.lon <= ilon+1)
if np.sum(p1*p2) == 0:
D[90 + ilat, 180 +ilon] = np.nan
else:
D[90 + ilat, 180 + ilon] = np.mean(ds.var.values[p1*p2])
# D[90 + ilat, 180 + ilon] = np.std(ds.var.values[p1*p2])
Other than using numba/cython to speed this up, I was wondering whether this is something you can directly do with xarray in a more efficient way?
You should be able to solve this using pandas and xarray.
You will first need to convert your data set to a pandas data frame.
Once this is done, df is the dataframe and assuming longitude and latitude are lon/lat, you will need to round the lon/lats to the nearest integer value, and then calculate the mean for each lon/lat. You will then need to set lon/lat to indices. Then you can use xarray's to_xarray to convert to an array:
import xarray as xr
import pandas as pd
import numpy as np
df = df.assign(lon = lambda x: np.round(x.lon))
df = df.assign(lat = lambda x: np.round(x.lat))
df = df.groupby(["lat", "lon"]).mean()
df = df.set_index(["lat", "lon"])
df.to_xarray()
I use #robert-wilson as a starting point, and to_xarray is indeed part of my solution. Other inspiration came from here. The approach that I used is shown below. It's probably slower than numba-ing my solution above, but much simpler.
import netCDF4
import numpy as np
import xarray as xr
import pandas as pd
fname = "super_funky_file.nc"
f = netCDF4.Dataset(fname)
lat = f.variables['lat'][:]
lon = f.variables['lon'][:]
vari = f.variables['super_duper_variable'][:]
df = pd.DataFrame({"lat":lat,
"lon":lon,
"vari":vari})
# Simple functions to calculate the grid location in rows/cols
# using lat/lon as inputs. Global 0.5 deg grid
# Remember to cast to integer
to_col = lambda x: np.floor(
(x+90)/0.5).astype(
np.int)
to_row = lambda x: np.floor(
(x+180.)/0.5).astype(
np.int)
# Map the latitudes to columns
# Map the longitudes to rows
df['col'] = df.lat.map(to_col)
df['row'] = df.lon.map(to_row)
# Aggregate by row and col
gg = df.groupby(['col', 'row'])
# Now, create an xarray dataset with
# the mean of vari per grid cell
ds = gg.mean().to_xarray()
dx = gg.std().to_xarray()
ds['stdi'] = dx['vari']
dx = gg.count().to_xarray()
ds['counti'] = dx['vari']```

using pycharts plot coordinates on map

I use coordinates from csv file to plot on the map. But just only one point on the map.
from pyecharts import Geo
import pandas as pd
df=pd.read_csv(r"C:\157.csv")
df.head()
geo_cities_coords={i:[df.iloc[i]["Lon"],df.iloc[i]["Lat"]] for i in range(len(df))}
print(geo_cities_coords)
attr=list(df["City"])
value=list(df["Days"]/100000)
geo = Geo("Title", "subtitle", title_color="#fff", title_pos="center",width=1200, height=600, background_color='#404a59')
geo.add("",attr,value,visual_range=[0,100],symbol_size= 5,
visual_text_color= "#fff",is_piecewise = True,
is_visualmap= True,maptype = '北京', visual_split_number= 10,
geo_cities_coords=geo_cities_coords)
geo.render( 'test.html')
CSV file format:
City,Lat,Lon,Days
1,39.97556667,116.33035,39201.35731
2,39.97545,116.3302333,39201.35903
3,39.97383333,116.3329667,39201.52389

Categories