I tried to run a spatial join between a list of assets and a river basin dataset, which you can find at the link below:
https://datasets.wri.org/dataset/aqueduct-global-flood-risk-maps?msclkid=630fc948b63611ec9931936b22cf4990
The first approach is a join with both layers in EPSG:4326, and it works fine:
rfd = r"C:\Users\~\aqueduct_global_flood_risk_data_by_river_basin_20150304.shp"
wri_rfr = gpd.read_file(rfd, crs='epsg:4326')
test = ['Unit 1', 'Unit 2' ]
test_lat = ['0.176095', '-24.193790']
test_lon = ['117.495523', '150.370650']
df = pd.DataFrame()
df['Name'] = test
df['Latitude'] = test_lat
df['Longitude'] = test_lon
gdf = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df['Longitude'], df['Latitude']))
gdf = gdf.set_crs('epsg:4326')
joined = gpd.sjoin(gdf, wri_rfr, how='inner')
len(joined )
Both assets are matched to a basin.
In a second approach, I try to create a 500 m buffer around my assets using a metre-based projection (EPSG:3006) before joining, but it returns no results:
rfd = r"C:\Users\~\aqueduct_global_flood_risk_data_by_river_basin_20150304.shp"
wri_rfr = gpd.read_file(rfd, crs='epsg:4326')
test = ['Unit 1', 'Unit 2' ]
test_lat = ['0.176095', '-24.193790']
test_lon = ['117.495523', '150.370650']
df = pd.DataFrame()
df['Name'] = test
df['Latitude'] = test_lat
df['Longitude'] = test_lon
gdf = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df['Longitude'], df['Latitude']))
gdf = gdf.set_crs('epsg:4326')
gdf = gdf.to_crs({'init': 'epsg:3006'})
gdf.geometry = gdf.geometry.buffer(500)
gdf = gdf.loc[gdf.is_valid]
wri_rfr_3006 = wri_rfr.to_crs({'init': 'epsg:3006'})
wri_rfr_3006 = wri_rfr_3006.loc[wri_rfr_3006.is_valid]
joined = gpd.sjoin(gdf, wri_rfr_3006, how='inner')
len(joined)
It returns no joins.
What am I missing here? Why are the results different?
I have coded up the data sourcing of the shapefiles below.
Take a look at the documentation at https://epsg.io/3006: this CRS is defined for Sweden, so locations in Borneo and Australia will be badly distorted when expressed in metres from Sweden.
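As a quick illustration (a minimal sketch, not part of the fix below), projecting one of the test points into EPSG:3006 shows how far outside that CRS's intended area of use it ends up:
import geopandas as gpd
from shapely.geometry import Point

# hypothetical check: the Borneo test point expressed in the Sweden-specific CRS
pt = gpd.GeoSeries([Point(117.495523, 0.176095)], crs="epsg:4326")
print(pt.to_crs("epsg:3006"))  # easting/northing values far outside Sweden's area of use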
My approach is to work out the UTM CRS of each point, buffer it in metres there, then convert it back to EPSG:4326.
With the buffered point geometry you can now spatial join without ever having used a CRS that is inappropriate for global geometry.
test = ["Unit 1", "Unit 2"]
test_lat = ["0.176095", "-24.193790"]
test_lon = ["117.495523", "150.370650"]
df = pd.DataFrame()
df["Name"] = test
df["Latitude"] = test_lat
df["Longitude"] = test_lon
gdf = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df["Longitude"], df["Latitude"]))
gdf = gdf.set_crs("epsg:4326")
# work out the UTM CRS for each point, then buffer it and return it in the original CRS
def buffer_meter(g, crs="epsg:6666", buffer=50):
    t = gpd.GeoDataFrame(geometry=[g], crs=crs)
    return t.to_crs(t.estimate_utm_crs()).buffer(buffer).to_crs(crs).values[0]
# buffer the points
gdf["geometry"] = gdf["geometry"].apply(buffer_meter, crs=gdf.crs, buffer=500)
# now join
gpd.sjoin(gdf, wri_rfr, how='inner')
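With the per-point UTM buffering in place, the join should once again return a match for both units (a minimal check, assuming the data loaded as above):
joined = gpd.sjoin(gdf, wri_rfr, how="inner")
print(joined["Name"].tolist())  # both 'Unit 1' and 'Unit 2' are expected to appear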
Data sourcing:
import requests
from pathlib import Path
from zipfile import ZipFile
import urllib
import geopandas as gpd
import pandas as pd
# download data sets
urls = [
"http://datasets.wri.org/dataset/c19396d9-45c8-4e92-bf05-d1411c9cc2ca/resource/498319f7-992a-4447-94b4-c62d8f1daa38/download/aqueductglobalfloodriskdatabycountry20150304.zip",
"http://datasets.wri.org/dataset/c19396d9-45c8-4e92-bf05-d1411c9cc2ca/resource/471ef133-939c-4ca6-9b1c-5f81b5251c2b/download/aqueductglobalfloodriskdatabyriverbasin20150304.zip",
"http://datasets.wri.org/dataset/c19396d9-45c8-4e92-bf05-d1411c9cc2ca/resource/dd90c26a-edf2-46e4-be22-4273ab2344d0/download/aqueductglobalfloodriskdatabystate20150304.zip",
]
dfs = {}
for url in urls:
    f = Path.cwd().joinpath(urllib.parse.urlparse(url).path.split("/")[-1])
    if not f.exists():
        r = requests.get(url, stream=True, headers={"User-Agent": "XY"})
        with open(f, "wb") as fd:
            for chunk in r.iter_content(chunk_size=128):
                fd.write(chunk)
        zfile = ZipFile(f)
        zfile.extractall(f.stem)
    dfs[f.stem] = gpd.read_file(list(f.parent.joinpath(f.stem).glob("*.shp"))[0])
wri_rfr = dfs["aqueductglobalfloodriskdatabyriverbasin20150304"]
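A quick sanity check of the loaded basin layer (a minimal sketch, assuming the download above succeeded):
print(wri_rfr.crs)                                # should report EPSG:4326
print(wri_rfr.geometry.geom_type.value_counts())  # polygon / multipolygon counts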
Related
from folium.plugins import MarkerCluster
import folium
from folium.features import CustomIcon
import pandas as pd
import folium.plugins as plug
df = pd.read_excel('store.xlsx')
df1 = df[['대리점명','Latitude','Longitude']]
df1.rename(columns= {'대리점명':'Store'}, inplace=True)
df1.head()
x = []
y = []
name = []
for i in range(len(df1['Latitude'])):
    if df['Latitude'][i] == 0.0 or df['Longitude'][i] == 0.0:
        pass
    else:
        name.append(df1['Store'][i])
        x.append(df1['Latitude'][i])
        y.append(df1['Longitude'][i])
print('store_sum: ',len(name))
print('x_sum: ',len(x))
print('y_sum: ',len(y))
import folium
import folium.plugins as plug
import json
map_store = folium.Map(location=[37.58, 127.0],zoom_start=11.5)
marker_cluster = plug.MarkerCluster().add_to(map_store)
file_name= r'seoul_municipalities_geo.json'
file_name = file_name.replace('\\','/')
with open(file_name, 'rt') as f:
    geo = json.load(f)
folium.GeoJson(geo, name='Store').add_to(map_store)
for i in range(len(x)):
    folium.Marker([x[i], y[i]], popup=name[i], icon=folium.Icon(color='purple', icon='ok-circle')).add_to(marker_cluster)
map_store
sub_df = df
latitude = 37.58
longitude = 127.0
mm = folium.Map(
location=[latitude, longitude],
zoom_start=11.5
)
coords = sub_df[['Latitude', 'Longitude']]
marker_cluster = MarkerCluster().add_to(mm)
for lat, long in zip(coords['Latitude'], coords['Longitude']):
    icon_path = r"hci.png"
    icon = CustomIcon(
        icon_image=icon_path,
        icon_size=(180, 80),
        icon_anchor=(50, 50),
    )
    marker = folium.Marker(location=[lat, long], icon=icon, popup="대리점명")
    mm.add_child(marker)
mm
I have tried many times but I really can't get it to work.
My English is not good, so my question may be hard to understand; please bear with me.
Please look at the URL below:
https://towardsdatascience.com/visualizing-tesla-superchargers-in-france-8c10894ab3c
This is a perfect example of what I want to mimic.
In the first image, I want to change the purple checkbox marker to the company CI (the CI image is in the second picture).
The point is that each marker must be added to a marker cluster. The latitudes, longitudes and store names below are generated randomly, and the logo used is the Stack Overflow logo.
import random
import string
import numpy as np
import pandas as pd
import folium
from folium.plugins import MarkerCluster
from folium.features import CustomIcon
# random store names, latitudes and longitudes (name_list was not shown in the original; generated here so the example runs)
name_list = [''.join(random.choices(string.ascii_letters + string.digits, k=10)) for _ in range(100)]
df = pd.DataFrame({'Store': name_list,
                   'Latitude': [random.uniform(36.0, 38.0) for _ in range(100)],
                   'Longitude': [random.uniform(126.0, 128.0) for _ in range(100)]})
df.head()
Store Latitude Longitude
0 ocUReOT56a 36.164013 127.045411
1 KbAQtbB5eG 36.534577 127.515191
2 PdiCOLvjC4 36.715178 126.333321
3 eM33oRtVii 37.889212 126.589194
4 nporJ7t4mY 36.604549 127.563762
latitude = 37.58
longitude = 127.0
mm = folium.Map(location=[latitude, longitude], zoom_start=10)
marker_cluster = MarkerCluster().add_to(mm)
for row in df.itertuples():
    icon_path = r"./data/240px-Stack_Overflow_icon.svg.png"
    icon = CustomIcon(icon_image=icon_path, icon_size=(50, 50))
    folium.Marker(location=[row.Latitude, row.Longitude], icon=icon, popup=row.Store).add_to(marker_cluster)
mm
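To view the result outside a notebook, the map can be written to a standalone HTML file (a small usage sketch; the file name is arbitrary):
mm.save('store_map.html')  # open in any browser; the markers collapse into clusters as you zoom out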
I have created an "input form" with several ipywidget boxes. I want to be able to reference all the values to create a new dataframe.
I'm currently doing this in a horrible way.
portfolio_df = pd.DataFrame([[VBox1.children[0].value, VBox2.children[0].value, VBox3.children[0].value, VBox4.children[0].value]],
columns=['Product Name','Units','Price', 'Invested Amount'])
row_2 = [VBox1.children[1].value, VBox2.children[1].value, VBox3.children[1].value, VBox4.children[1].value]
portfolio_df.loc[len(portfolio_df)] = row_2
row_3 = [VBox1.children[2].value, VBox2.children[2].value, VBox3.children[2].value, VBox4.children[2].value]
portfolio_df.loc[len(portfolio_df)] = row_3
row_4 = [VBox1.children[3].value, VBox2.children[3].value, VBox3.children[3].value, VBox4.children[3].value]
portfolio_df.loc[len(portfolio_df)] = row_4
and so on, up to row 23 in this instance (the length will vary up to the number of children within a VBox).
I suspect I can do this more pythonically using a for loop but can't figure it out.
Full code as requested (I've renamed the columns, so my live data is different, but this is an exact replica of the set-up):
import pandas as pd
import numpy as np
import datetime as dt
import ipywidgets as ipw
from ipywidgets import *
barrier_list = pd.DataFrame(np.random.randn(24, 4), columns=('Product Name', 'ISIN', 'A', 'B'))
barrier_list= barrier_list.astype(str)
dd_list = []
for i in range(len(barrier_list['Product Name'])):
    dropdown = ipw.FloatText(description=barrier_list['ISIN'][i],
                             value=barrier_list['Product Name'][i],
                             disabled=False,
                             layout={'width': '350px'})
    dropdown.style.description_width = 'initial'
    dd_list.append(dropdown)
dd_list1 = []
for i in range(len(barrier_list['Product Name'])):
    dropdown1 = ipw.FloatText(description='Units',
                              value=0,
                              layout={'width': '200px'})
    dd_list1.append(dropdown1)
dd_list2 = []
for i in range(len(barrier_list['Product Name'])):
    dropdown2 = ipw.FloatText(description='Price',
                              value=0,
                              layout={'width': '200px'})
    dd_list2.append(dropdown2)
dd_list3 = []
for i in range(len(barrier_list['Product Name'])):
    dropdown3 = ipw.FloatText(description='Value',
                              value=0,
                              layout={'width': '200px'})
    dd_list3.append(dropdown3)
VBox1 = ipw.VBox(dd_list)
VBox2 = ipw.VBox(dd_list1)
VBox3 = ipw.VBox(dd_list2)
VBox4 = ipw.VBox(dd_list3)
HBox = widgets.HBox([VBox1, VBox2, VBox3, VBox4])
Solved this one by looping through the VBoxes one by one and then concatenating the dataframes into one main one.
product_df = pd.DataFrame()
for i in range(len(dd_list)):
    product_name_df = pd.DataFrame([[VBox1.children[i].value]], columns=['Product Name'])
    product_df = product_df.append(product_name_df)  # note: DataFrame.append was removed in pandas 2.0; use pd.concat there
unit_df = pd.DataFrame()
for i in range(len(dd_list)):
    unit_amount_df = pd.DataFrame([[VBox2.children[i].value]], columns=['Units'])
    unit_df = unit_df.append(unit_amount_df)
price_df = pd.DataFrame()
for i in range(len(dd_list)):
    price_amount_df = pd.DataFrame([[VBox3.children[i].value]], columns=['Price'])
    price_df = price_df.append(price_amount_df)
value_df = pd.DataFrame()
for i in range(len(dd_list)):
    value_amount_df = pd.DataFrame([[VBox4.children[i].value]], columns=['Value'])
    value_df = value_df.append(value_amount_df)
df_list = [product_df.reset_index(drop=True), unit_df.reset_index(drop=True),
           price_df.reset_index(drop=True), value_df.reset_index(drop=True)]
portfolio_df = pd.concat(df_list, axis=1)
portfolio_df
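A more compact variant of the same idea (a sketch, assuming the VBox layout above): build each column with a list comprehension instead of repeated appends, which also avoids DataFrame.append having been removed in pandas 2.0.
portfolio_df = pd.DataFrame({
    'Product Name': [w.value for w in VBox1.children],
    'Units': [w.value for w in VBox2.children],
    'Price': [w.value for w in VBox3.children],
    'Value': [w.value for w in VBox4.children],
})
portfolio_df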
I'm working on a personal project and I'm trying to retrieve air quality data from the https://aqicn.org website using their API.
I've used this code, which I've copied and adapted for the city of Bucharest as follows:
import pandas as pd
import folium
import requests
# GET data from AQI website through the API
base_url = "https://api.waqi.info"
path_to_file = "~/path"
# Got token from:- https://aqicn.org/data-platform/token/#/
with open(path_to_file) as f:
    contents = f.readlines()
key = contents[0]
# (lat, lon) -> bottom left, (lat, lon) -> top right
latlngbox = "44.300264,25.920181,44.566991,26.297836"  # for Bucharest
trail_url = f"/map/bounds/?token={key}&latlng={latlngbox}"
my_data = pd.read_json(base_url + trail_url)  # joined parts of the URL
print('columns->', my_data.columns)  # 2 cols: 'status' and 'data' (JSON)
### Built a dataframe from the json file
all_rows = []
for each_row in my_data['data']:
    all_rows.append([each_row['station']['name'],
                     each_row['lat'],
                     each_row['lon'],
                     each_row['aqi']])
df = pd.DataFrame(all_rows, columns=['station_name', 'lat', 'lon', 'aqi'])
# Cleaned the DataFrame
df['aqi'] = pd.to_numeric(df.aqi, errors='coerce') # Invalid parsing to NaN
# Remove NaN entries in col
df1 = df.dropna(subset = ['aqi'])
Unfortunately it only retrieves 4 stations whereas there are many more available on the actual site. In the API documentation the only limitation I saw was for "1,000 (one thousand) requests per second" so why can't I get more of them?
Also, I've tried to modify the lat-long values and managed to get more stations, but they were outside the city I was interested in.
Here is a view of the actual perimeter I've used in the embedded code.
If you have any suggestions as of how I can solve this issue, I'd be very happy to read your thoughts. Thank you!
Try using waqi through aqicn... it's not exactly a clean API, but I found it works quite well.
import pandas as pd
import folium
from folium.plugins import HeatMap

url1 = 'https://api.waqi.info'
# Get token from:- https://aqicn.org/data-platform/token/#/
token = 'XXX'
box = '113.805332,22.148942,114.434299,22.561716'  # polygon around Hong Kong via bboxfinder.com
url2 = f'/map/bounds/?latlng={box}&token={token}'
my_data = pd.read_json(url1 + url2)
all_rows = []
for each_row in my_data['data']:
    all_rows.append([each_row['station']['name'], each_row['lat'], each_row['lon'], each_row['aqi']])
df = pd.DataFrame(all_rows, columns=['station_name', 'lat', 'lon', 'aqi'])
From there it's easy to plot:
df['aqi'] = pd.to_numeric(df.aqi,errors='coerce')
print('with NaN->', df.shape)
df1 = df.dropna(subset = ['aqi'])
df2 = df1[['lat', 'lon', 'aqi']]
init_loc = [22.396428, 114.109497]
max_aqi = int(df1['aqi'].max())
print('max_aqi->', max_aqi)
m = folium.Map(location = init_loc, zoom_start = 5)
heat_aqi = HeatMap(df2, min_opacity=0.1, max_val=max_aqi,
                   radius=60, blur=20, max_zoom=2)
m.add_child(heat_aqi)
m
Or, plotted as individual markers:
centre_point = [22.396428, 114.109497]
m2 = folium.Map(location = centre_point,tiles = 'Stamen Terrain', zoom_start= 6)
for idx, row in df1.iterrows():
    lat = row['lat']
    lon = row['lon']
    station = row['station_name'] + ' AQI=' + str(row['aqi'])
    station_aqi = row['aqi']
    if station_aqi > 300:
        pop_color = 'red'
    elif station_aqi > 200:
        pop_color = 'orange'
    else:
        pop_color = 'green'
    folium.Marker(location=[lat, lon],
                  popup=station,
                  icon=folium.Icon(color=pop_color)).add_to(m2)
m2
Checking for stations within Hong Kong returns 19:
df[df['station_name'].str.contains('HongKong')]
I have a CSV table with the following columns:
column_names = ['image_id', 'xmin', 'ymin', 'width', 'height', 'xmax','ymax']
where xmin, ymin, xmax and ymax represent the bounding box that encloses some object; width and height are the image dimensions; and image_id is the file name (a .JPG file). Since I want to do object detection, I need to convert this table into COCO format. Surprisingly, I can't find any answer to this on the internet.
I had the same issue before, then I found this code; it is very helpful.
You will need to rename the columns as below and update the CSV file:
column_names = ['filename', 'class', 'width', 'height', 'xmin', 'ymin', 'xmax', 'ymax']
Then try this code:
import numpy as np
import json
import pandas as pd
path = 'annotations.csv' # the path to the CSV file
save_json_path = 'traincoco.json'
data = pd.read_csv(path)
images = []
categories = []
annotations = []
category = {}
category["supercategory"] = 'none'
category["id"] = 0
category["name"] = 'None'
categories.append(category)
data['fileid'] = data['filename'].astype('category').cat.codes
data['categoryid']= pd.Categorical(data['class'],ordered= True).codes
data['categoryid'] = data['categoryid']+1
data['annid'] = data.index
def image(row):
    image = {}
    image["height"] = row.height
    image["width"] = row.width
    image["id"] = row.fileid
    image["file_name"] = row.filename
    return image
def category(row):
    category = {}
    category["supercategory"] = 'None'
    category["id"] = row.categoryid
    category["name"] = row[2]
    return category
def annotation(row):
    annotation = {}
    area = (row.xmax - row.xmin) * (row.ymax - row.ymin)
    annotation["segmentation"] = []
    annotation["iscrowd"] = 0
    annotation["area"] = area
    annotation["image_id"] = row.fileid
    annotation["bbox"] = [row.xmin, row.ymin, row.xmax - row.xmin, row.ymax - row.ymin]
    annotation["category_id"] = row.categoryid
    annotation["id"] = row.annid
    return annotation
for row in data.itertuples():
    annotations.append(annotation(row))
imagedf = data.drop_duplicates(subset=['fileid']).sort_values(by='fileid')
for row in imagedf.itertuples():
    images.append(image(row))
catdf = data.drop_duplicates(subset=['categoryid']).sort_values(by='categoryid')
for row in catdf.itertuples():
    categories.append(category(row))
data_coco = {}
data_coco["images"] = images
data_coco["categories"] = categories
data_coco["annotations"] = annotations
json.dump(data_coco, open(save_json_path, "w"), indent=4)
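As an optional sanity check (a sketch, assuming pycocotools is installed), the generated file can be re-loaded to confirm the counts:
from pycocotools.coco import COCO
coco = COCO(save_json_path)
print(len(coco.imgs), 'images,', len(coco.anns), 'annotations,', len(coco.cats), 'categories')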
I have the following Python code that works perfectly fine for converting a single .csv file to a netCDF file.
However, I have multiple files (365), named 'TRMM_1998_01_02_newntcl.csv', 'TRMM_1998_01_03_newntcl.csv', ... up to 'TRMM_1998_12_31_newntcl.csv'.
Can somebody help me write a loop over all the CSV files so this code creates 365 netCDF files?
Any help is appreciated. Thanks in advance.
import datetime
import os
import numpy as np
import netCDF4

def convert_file(filename):
    data = np.loadtxt(fname=filename, delimiter=',')
    # filename = "TRMM_{}_{}_{}_newntcl.csv".format(d.year, d.month, d.day)
    Lat_data = np.loadtxt('Latitude.csv', delimiter=',')
    Lon_data = np.loadtxt('Longitude.csv', delimiter=',')
    # create a netCDF Dataset object
    with netCDF4.Dataset('TEST_file.nc', mode="w", format='NETCDF4') as ds:
        # some file-level meta-data attributes:
        ds.Conventions = "CF-1.6"
        ds.title = 'precipitation'
        ds.institution = 'Institute'
        ds.author = 'Author'
        lat_arr = data[:, 0]     # the first column
        lon_arr = data[:, 1]     # the second column
        precip_arr = data[:, 2]  # the third column
        nlat = lat_arr.reshape((161, 321))
        nlon = lon_arr.reshape((161, 321))
        # ds.createDimension('time', 0)
        ds.createDimension('latitude', 161)
        ds.createDimension('longitude', 321)
        precip = ds.createVariable('precip', 'f4', ('latitude', 'longitude'))
        precip[:] = data[:, 2]
        ## adds some attributes
        precip.units = 'mm'
        precip.long_name = 'Precipitation'
        lat = ds.createVariable('lat', 'f4', ('latitude'))
        lat[:] = Lat_data[:]
        ## adds some attributes
        lat.units = 'degrees_South'
        lat.long_name = 'Latitude'
        lon = ds.createVariable('lon', 'f4', ('longitude'))
        lon[:] = Lon_data[:]
        ## adds some attributes
        lon.units = 'degrees_East'
        lon.long_name = 'Longitude'
        print(ds)
    # print(filename)

# load the data
path = r'C:\Users\.spyder2'
os.chdir(path)
d = datetime.date(1998, 1, 1)
while d.year == 1998:
    d += datetime.timedelta(days=1)
    convert_file("TRMM_{}_{}_{}_newntcl.csv".format(d.year, d.month, d.day))
It looks like you can use a datetime.date object to loop through all of the days in a year. First, you should put the code you have in a function that takes a filename. Then, you can just make a date object and call the function in a loop:
import datetime

d = datetime.date(1998, 1, 1)
while d.year == 1998:
    # convert before advancing the date, so 1 January is included and the loop stops after 31 December;
    # {:02d} keeps the zero-padded month/day used in the file names
    convert_file("TRMM_{}_{:02d}_{:02d}_newntcl.csv".format(d.year, d.month, d.day))
    d += datetime.timedelta(days=1)
If I read your question correctly, there is an easier way using os in this case. You can just take the file names and use them in a loop:
import os
import numpy as np
import netCDF4

main_fp = "C:\\Users\\spyder2"
path, dirs, files = next(os.walk(main_fp))  # Python 3: use next(); .next() was Python 2 only
for f_path in files:
    data = np.loadtxt(os.path.join(path, f_path), delimiter=',')
    Lat_data = np.loadtxt('Latitude.csv', delimiter=',')  # put the lat and lon CSVs in a separate folder so they aren't read in the loop
    Lon_data = np.loadtxt('Longitude.csv', delimiter=',')
    # strip the .csv extension (str.strip removes characters, not a suffix, so use splitext instead)
    new_fname = os.path.splitext(f_path)[0]
    with netCDF4.Dataset(new_fname + '.nc', mode="w", format='NETCDF4') as ds:
        # some file-level meta-data attributes:
        ds.Conventions = "CF-1.6"
        ds.title = 'Non TC precipitation'
        ds.institution = 'AIR-Worldwide'
        ds.author = 'Dr. Dumindu Jayasekera'
        lat_arr = data[:, 0]     # the first column
        lon_arr = data[:, 1]     # the second column
        precip_arr = data[:, 2]  # the third column
        nlat = lat_arr.reshape((161, 321))
        nlon = lon_arr.reshape((161, 321))
        ds.createDimension('latitude', 161)
        ds.createDimension('longitude', 321)
        precip = ds.createVariable('precip', 'f4', ('latitude', 'longitude'))
        precip[:] = data[:, 2]
        ## adds some attributes
        precip.units = 'mm'
        precip.long_name = 'Precipitation'
        lat = ds.createVariable('lat', 'f4', ('latitude'))
        lat[:] = Lat_data[:]
        ## adds some attributes
        lat.units = 'degrees_South'
        lat.long_name = 'Latitude'
        lon = ds.createVariable('lon', 'f4', ('longitude'))
        lon[:] = Lon_data[:]
        ## adds some attributes
        lon.units = 'degrees_East'
        lon.long_name = 'Longitude'
        print(ds)
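An optional check (a sketch; the file name below is just an example of what the loop above would produce): re-open one of the generated files to confirm the dimensions and variables were written as expected.
import netCDF4
with netCDF4.Dataset('TRMM_1998_01_02_newntcl.nc') as ds:
    print(ds.dimensions.keys(), ds.variables.keys())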