Read multiple csv files and write multiple netCDF files - python

I have the following Python code, which works fine for converting a single .csv file to a netCDF file.
But I have multiple files (365), named 'TRMM_1998_01_02_newntcl.csv', 'TRMM_1998_01_03_newntcl.csv', ... up to 'TRMM_1998_12_31_newntcl.csv'.
Can somebody help me loop through all the csv files and create 365 netCDF files using this code?
Any help is appreciated.
Thanks in advance.
import os
import datetime
import numpy as np
import netCDF4

def convert_file(filename):
    data = np.loadtxt(fname=filename, delimiter=',')
    # filename = "TRMM_{}_{}_{}_newntcl.csv".format(d.year, d.month, d.day)
    Lat_data = np.loadtxt('Latitude.csv', delimiter=',')
    Lon_data = np.loadtxt('Longitude.csv', delimiter=',')

    # create a netCDF Dataset object
    with netCDF4.Dataset('TEST_file.nc', mode="w", format='NETCDF4') as ds:
        # some file-level meta-data attributes:
        ds.Conventions = "CF-1.6"
        ds.title = 'precipitation'
        ds.institution = 'Institute'
        ds.author = 'Author'

        lat_arr = data[:,0]     # the first column
        lon_arr = data[:,1]     # the second column
        precip_arr = data[:,2]  # the third column

        nlat = lat_arr.reshape( (161, 321) )
        nlon = lon_arr.reshape( (161, 321) )

        # ds.createDimension('time', 0)
        ds.createDimension('latitude', 161)
        ds.createDimension('longitude', 321)

        precip = ds.createVariable('precip', 'f4', ('latitude', 'longitude'))
        precip[:] = data[:,2]
        ## adds some attributes
        precip.units = 'mm'
        precip.long_name = 'Precipitation'

        lat = ds.createVariable('lat', 'f4', ('latitude',))
        lat[:] = Lat_data[:]
        ## adds some attributes
        lat.units = 'degrees_South'
        lat.long_name = 'Latitude'

        lon = ds.createVariable('lon', 'f4', ('longitude',))
        lon[:] = Lon_data[:]
        ## adds some attributes
        lon.units = 'degrees_East'
        lon.long_name = 'Longitude'

        print(ds)
        # print(filename)

# load the data
path = r'C:\Users\.spyder2'
os.chdir(path)
d = datetime.date(1998, 1, 1)
while d.year == 1998:
    d += datetime.timedelta(days=1)
    convert_file("TRMM_{}_{}_{}_newntcl.csv".format(d.year, d.month, d.day))

It looks like you can use a datetime.date object to loop through all of the days in a year. First, you should put the code you have in a function that takes a filename. Then, you can just make a date object and call the function in a loop:
import datetime

d = datetime.date(1998, 1, 1)
while d.year == 1998:
    convert_file("TRMM_{}_{}_{}_newntcl.csv".format(d.year, d.month, d.day))
    d += datetime.timedelta(days=1)  # convert first, then advance, so 1 Jan 1998 is included and 1 Jan 1999 is not

If I read your question correctly, there is an easier way of using os in this case. You can just take in the file names and use them in a loop:
import os
import numpy as np
import netCDF4

main_fp = "C:\\Users\\spyder2"
path, dirs, files = next(os.walk(main_fp))

for f_path in files:
    data = np.loadtxt(os.path.join(path, f_path), delimiter=',')
    Lat_data = np.loadtxt('Latitude.csv', delimiter=',')  # put the lat and long csv's in a separate folder, so you don't read them into the loop
    Lon_data = np.loadtxt('Longitude.csv', delimiter=',')
    # drop the .csv extension to build the output name
    new_fname = os.path.splitext(f_path)[0]
    with netCDF4.Dataset(new_fname + '.nc', mode="w", format='NETCDF4') as ds:
        # some file-level meta-data attributes:
        ds.Conventions = "CF-1.6"
        ds.title = 'Non TC precipitation'
        ds.institution = 'AIR-Worldwide'
        ds.author = 'Dr. Dumindu Jayasekera'

        lat_arr = data[:,0]     # the first column
        lon_arr = data[:,1]     # the second column
        precip_arr = data[:,2]  # the third column

        nlat = lat_arr.reshape( (161, 321) )
        nlon = lon_arr.reshape( (161, 321) )

        ds.createDimension('latitude', 161)
        ds.createDimension('longitude', 321)

        precip = ds.createVariable('precip', 'f4', ('latitude', 'longitude'))
        precip[:] = data[:,2]
        ## adds some attributes
        precip.units = 'mm'
        precip.long_name = 'Precipitation'

        lat = ds.createVariable('lat', 'f4', ('latitude',))
        lat[:] = Lat_data[:]
        ## adds some attributes
        lat.units = 'degrees_South'
        lat.long_name = 'Latitude'

        lon = ds.createVariable('lon', 'f4', ('longitude',))
        lon[:] = Lon_data[:]
        ## adds some attributes
        lon.units = 'degrees_East'
        lon.long_name = 'Longitude'

        print(ds)
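Alternatively, if moving Latitude.csv and Longitude.csv into another folder is inconvenient, a filename pattern keeps them out of the loop (a sketch, assuming the daily files follow the TRMM_*_newntcl.csv pattern from the question):

import glob
import os
import numpy as np

main_fp = "C:\\Users\\spyder2"
# only pick up the daily TRMM files, so Latitude.csv / Longitude.csv are never read as data
for full_path in sorted(glob.glob(os.path.join(main_fp, "TRMM_*_newntcl.csv"))):
    data = np.loadtxt(full_path, delimiter=',')
    new_fname = os.path.splitext(os.path.basename(full_path))[0]
    # ... write new_fname + '.nc' exactly as in the loop body above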

Related

How to save the results of a function as a new CSV?

The code needs to take addresses from a CSV file and then use a function to compute the corresponding latitudes and longitudes. I get the correct latitudes and longitudes, but I am unable to save them to a new CSV file.
import requests
import urllib.parse
import pandas as pd

# function to get the coordinates:
def lat_long(add):
    url = 'https://nominatim.openstreetmap.org/search/'+urllib.parse.quote(add)+'?format=json'
    response = requests.get(url).json()
    print(response[0]["lat"], response[0]["lon"])
    return

# the 5 address values are read from the CSV file and passed to the function
df = pd.read_csv('C:\\Users\\Umer Abbas\\Desktop\\lat_long.csv')
i = 0
print("Latitude","","Longitude")
for i in range(0, 5):
    add = df._get_value(i, 'Address')
    lat_long(add)
Output is:
Latitude Longitude
34.0096961 71.8990106
34.0123846 71.5787458
33.6038766 73.048136
33.6938118 73.0651511
24.8546842 67.0207055
I want to save this output into a new file and I am unable to get the results.
Just a small modification might help: the original lat_long only prints the coordinates and returns None, so there is nothing to collect and save.
def lat_long(add):
    url = 'https://nominatim.openstreetmap.org/search/'+urllib.parse.quote(add)+'?format=json'
    response = requests.get(url).json()
    print(response[0]["lat"], response[0]["lon"])
    Lat = response[0]["lat"]
    Long = response[0]["lon"]
    return Lat, Long

Lat_List = []
Long_List = []
df = pd.read_csv('C:\\Users\\Umer Abbas\\Desktop\\lat_long.csv')
i = 0
print("Latitude","","Longitude")
for i in range(0, 5):
    add = df._get_value(i, 'Address')
    Lat, Long = lat_long(add)  # one request per address; unpack the returned pair
    Lat_List.append(Lat)
    Long_List.append(Long)

df1 = pd.DataFrame(columns=['Latitude', 'Longitude'])
df1['Latitude'] = Lat_List
df1['Longitude'] = Long_List
df1.to_csv("LatLong.csv")
# one line of change here
def lat_long(add):
    url = 'https://nominatim.openstreetmap.org/search/'+urllib.parse.quote(add)+'?format=json'
    response = requests.get(url).json()
    print(response[0]["lat"], response[0]["lon"])
    return response[0]["lat"], response[0]["lon"]  # return the lat and long

# three lines added here
df = pd.read_csv('C:\\Users\\Umer Abbas\\Desktop\\lat_long.csv')
i = 0
l = []  # define an empty list
print("Latitude","","Longitude")
for i in range(0, 5):
    add = df._get_value(i, 'Address')
    l.append(lat_long(add))  # append the (lat, lon) tuple to l

# create a dataframe and write it out as csv
# note: the function returns (lat, lon), so the columns are labelled in that order
pd.DataFrame(l, columns=['Latitude', 'Longitude']).to_csv('test.csv', sep=' ')
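If the CSV has more than five addresses, the same idea extends to the whole column (a sketch, assuming the address column is still named 'Address'):

df = pd.read_csv('C:\\Users\\Umer Abbas\\Desktop\\lat_long.csv')
# geocode every address instead of only the first five rows
coords = [lat_long(add) for add in df['Address']]
pd.DataFrame(coords, columns=['Latitude', 'Longitude']).to_csv('LatLong_all.csv', index=False)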

Geopandas: different .sjoin() results with different projections systems

I tried to run a spatial join between a list of assets and a river basin dataset that you can find at the link below
https://datasets.wri.org/dataset/aqueduct-global-flood-risk-maps?msclkid=630fc948b63611ec9931936b22cf4990
The first approach was a join on an EPSG 4326 projection setting, and it works fine.
rfd = r"C:\Users\~\aqueduct_global_flood_risk_data_by_river_basin_20150304.shp"
wri_rfr = gpd.read_file(rfd, crs='epsg:4326')
test = ['Unit 1', 'Unit 2' ]
test_lat = ['0.176095', '-24.193790']
test_lon = ['117.495523', '150.370650']
df = pd.DataFrame()
df['Name'] = test
df['Latitude'] = test_lat
df['Longitude'] = test_lon
gdf = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df['Longitude'], df['Latitude']))
gdf = gdf.set_crs('epsg:4326')
joined = gpd.sjoin(gdf, wri_rfr, how='inner')
len(joined )
Both assets find a match in the join.
In a second approach, I try to create a 500 m buffer around my assets using a metre-based projection (EPSG:3006) and then run the join... but it returns no result?
rfd = r"C:\Users\~\aqueduct_global_flood_risk_data_by_river_basin_20150304.shp"
wri_rfr = gpd.read_file(rfd, crs='epsg:4326')
test = ['Unit 1', 'Unit 2' ]
test_lat = ['0.176095', '-24.193790']
test_lon = ['117.495523', '150.370650']
df = pd.DataFrame()
df['Name'] = test
df['Latitude'] = test_lat
df['Longitude'] = test_lon
gdf = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df['Longitude'], df['Latitude']))
gdf = gdf.set_crs('epsg:4326')
gdf = gdf.to_crs({'init': 'epsg:3006'})
gdf.geometry = gdf.geometry.buffer(500)
gdf = gdf.loc[gdf.is_valid]
wri_rfr_3006 = wri_rfr.to_crs({'init': 'epsg:3006'})
wri_rfr_3006 = wri_rfr_3006.loc[wri_rfr_3006.is_valid]
joined = gpd.sjoin(gdf, wri_rfr_3006 , how='inner')
len(joined )
It returns no joins.
What am I missing here? Why would the results be different?
I have coded up the data sourcing of the shape files below.
Take a look at the documentation https://epsg.io/3006: this projection is for Sweden. Hence locations in Borneo and Australia are going to give rounding errors when expressed in metres from Sweden.
I have taken the approach of working out the UTM CRS of each point, buffering it, then converting back to epsg:4326.
With the buffered point geometry the spatial join now works, since an inappropriate CRS for global geometry has not been used.
test = ["Unit 1", "Unit 2"]
test_lat = ["0.176095", "-24.193790"]
test_lon = ["117.495523", "150.370650"]
df = pd.DataFrame()
df["Name"] = test
df["Latitude"] = test_lat
df["Longitude"] = test_lon
gdf = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df["Longitude"], df["Latitude"]))
gdf = gdf.set_crs("epsg:4326")
# work out UTM CRS for each point, then buffer it and return back as original CRS
def buffer_meter(g, crs="epsg:6666", buffer=50):
t = gpd.GeoDataFrame(geometry=[g], crs=crs)
return t.to_crs(t.estimate_utm_crs()).buffer(buffer).to_crs(crs).values[0]
# buffer the points
gdf["geometry"] = gdf["geometry"].apply(buffer_meter, crs=gdf.crs, buffer=500)
# now join
gpd.sjoin(gdf, wri_rfr, how='inner')
data sourcing
import requests
from pathlib import Path
from zipfile import ZipFile
import urllib
import geopandas as gpd
import pandas as pd

# download data sets
urls = [
    "http://datasets.wri.org/dataset/c19396d9-45c8-4e92-bf05-d1411c9cc2ca/resource/498319f7-992a-4447-94b4-c62d8f1daa38/download/aqueductglobalfloodriskdatabycountry20150304.zip",
    "http://datasets.wri.org/dataset/c19396d9-45c8-4e92-bf05-d1411c9cc2ca/resource/471ef133-939c-4ca6-9b1c-5f81b5251c2b/download/aqueductglobalfloodriskdatabyriverbasin20150304.zip",
    "http://datasets.wri.org/dataset/c19396d9-45c8-4e92-bf05-d1411c9cc2ca/resource/dd90c26a-edf2-46e4-be22-4273ab2344d0/download/aqueductglobalfloodriskdatabystate20150304.zip",
]

dfs = {}
for url in urls:
    f = Path.cwd().joinpath(urllib.parse.urlparse(url).path.split("/")[-1])
    if not f.exists():
        r = requests.get(url, stream=True, headers={"User-Agent": "XY"})
        with open(f, "wb") as fd:
            for chunk in r.iter_content(chunk_size=128):
                fd.write(chunk)
        zfile = ZipFile(f)
        zfile.extractall(f.stem)
    dfs[f.stem] = gpd.read_file(list(f.parent.joinpath(f.stem).glob("*.shp"))[0])
wri_rfr = dfs["aqueductglobalfloodriskdatabyriverbasin20150304"]

Retrieving data from the Air Quality Index (AQI) website through the API and only receiving a small number of stations

I'm working on a personal project and I'm trying to retrieve air quality data from the https://aqicn.org website using their API.
I've used this code, which I've copied and adapted for the city of Bucharest as follows:
import pandas as pd
import folium
import requests

# GET data from the AQI website through the API
base_url = "https://api.waqi.info"
path_to_file = "~/path"

# Got token from: https://aqicn.org/data-platform/token/#/
with open(path_to_file) as f:
    contents = f.readlines()
    key = contents[0]

# (lat, lon) -> bottom left, (lat, lon) -> top right
latlngbox = "44.300264,25.920181,44.566991,26.297836"  # For Bucharest
trail_url = f"/map/bounds/?token={key}&latlng={latlngbox}"
my_data = pd.read_json(base_url + trail_url)  # Joined parts of URL
print('columns->', my_data.columns)  # 2 cols, 'status' and 'data' (JSON)

### Build a dataframe from the json
all_rows = []
for each_row in my_data['data']:
    all_rows.append([each_row['station']['name'],
                     each_row['lat'],
                     each_row['lon'],
                     each_row['aqi']])
df = pd.DataFrame(all_rows, columns=['station_name', 'lat', 'lon', 'aqi'])

# Clean the DataFrame
df['aqi'] = pd.to_numeric(df.aqi, errors='coerce')  # invalid parsing -> NaN
# Remove NaN entries in the column
df1 = df.dropna(subset=['aqi'])
Unfortunately it only retrieves 4 stations whereas there are many more available on the actual site. In the API documentation the only limitation I saw was for "1,000 (one thousand) requests per second" so why can't I get more of them?
Also, I've tried to modify the lat-long values and managed to get more stations, but they were outside the city I was interested in.
Here is a view of the actual perimeter I've used in the embedded code.
If you have any suggestions as to how I can solve this issue, I'd be very happy to read your thoughts. Thank you!
Try using waqi through aqicn... not exactly a clean API but I found it to work quite well
import pandas as pd
import folium
from folium.plugins import HeatMap  # needed for the heat map below

url1 = 'https://api.waqi.info'
# Get token from: https://aqicn.org/data-platform/token/#/
token = 'XXX'
box = '113.805332,22.148942,114.434299,22.561716'  # polygon around Hong Kong via bboxfinder.com
url2 = f'/map/bounds/?latlng={box}&token={token}'

my_data = pd.read_json(url1 + url2)
all_rows = []
for each_row in my_data['data']:
    all_rows.append([each_row['station']['name'], each_row['lat'], each_row['lon'], each_row['aqi']])
df = pd.DataFrame(all_rows, columns=['station_name', 'lat', 'lon', 'aqi'])
From there it's easy to plot:
df['aqi'] = pd.to_numeric(df.aqi, errors='coerce')
print('with NaN->', df.shape)
df1 = df.dropna(subset=['aqi'])
df2 = df1[['lat', 'lon', 'aqi']]

init_loc = [22.396428, 114.109497]
max_aqi = int(df1['aqi'].max())
print('max_aqi->', max_aqi)

m = folium.Map(location=init_loc, zoom_start=5)
heat_aqi = HeatMap(df2, min_opacity=0.1, max_val=max_aqi,
                   radius=60, blur=20, max_zoom=2)
m.add_child(heat_aqi)
m
Or as such
centre_point = [22.396428, 114.109497]
m2 = folium.Map(location=centre_point, tiles='Stamen Terrain', zoom_start=6)
for idx, row in df1.iterrows():
    lat = row['lat']
    lon = row['lon']
    station = row['station_name'] + ' AQI=' + str(row['aqi'])
    station_aqi = row['aqi']
    if station_aqi > 300:
        pop_color = 'red'
    elif station_aqi > 200:
        pop_color = 'orange'
    else:
        pop_color = 'green'
    folium.Marker(location=[lat, lon],
                  popup=station,
                  icon=folium.Icon(color=pop_color)).add_to(m2)
m2
Checking for stations within HK returns 19:
df[df['station_name'].str.contains('HongKong')]
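The same call can be run with the Bucharest box from the question to see how many stations the API actually holds for that area (a sketch; if only a handful of rows come back, that is what the API has registered for this bounding box):

box_bucharest = '44.300264,25.920181,44.566991,26.297836'  # bounding box from the question
url2 = f'/map/bounds/?latlng={box_bucharest}&token={token}'
buc = pd.read_json(url1 + url2)
print(len(buc['data']))  # number of stations returned for this box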

How to convert a CSV table into COCO format in python?

I have a CSV table with the following columns:
column_names = ['image_id', 'xmin', 'ymin', 'width', 'height', 'xmax','ymax']
where xmin, ymin, xmax and ymax represent the bounding box that encloses some object; width and height, the image dimensions; and image_id, the file name (a .JPG file). Since I want to do object detection, I need to convert this table into COCO format. Amazingly enough, I can't find any answer to this inquiry on the internet.
I had the same issue before, and then I found this code; it is very helpful.
You will need to rename the columns as follows and update the CSV file:
column_names = ['filename', 'class', 'width', 'height', 'xmin', 'ymin', 'xmax', 'ymax']
Then try this code:
import numpy as np
import json
import pandas as pd

path = 'annotations.csv'  # the path to the CSV file
save_json_path = 'traincoco.json'

data = pd.read_csv(path)

images = []
categories = []
annotations = []

# a default background category with id 0
category = {}
category["supercategory"] = 'none'
category["id"] = 0
category["name"] = 'None'
categories.append(category)

# integer ids for files, categories and annotations
data['fileid'] = data['filename'].astype('category').cat.codes
data['categoryid'] = pd.Categorical(data['class'], ordered=True).codes
data['categoryid'] = data['categoryid'] + 1
data['annid'] = data.index

def image(row):
    image = {}
    image["height"] = row.height
    image["width"] = row.width
    image["id"] = row.fileid
    image["file_name"] = row.filename
    return image

def category(row):  # note: this shadows the dict above, which has already been appended
    category = {}
    category["supercategory"] = 'None'
    category["id"] = row.categoryid
    category["name"] = row[2]  # with itertuples, row[2] is the 'class' column
    return category

def annotation(row):
    annotation = {}
    area = (row.xmax - row.xmin) * (row.ymax - row.ymin)
    annotation["segmentation"] = []
    annotation["iscrowd"] = 0
    annotation["area"] = area
    annotation["image_id"] = row.fileid
    # COCO bboxes are [x, y, width, height]
    annotation["bbox"] = [row.xmin, row.ymin, row.xmax - row.xmin, row.ymax - row.ymin]
    annotation["category_id"] = row.categoryid
    annotation["id"] = row.annid
    return annotation

for row in data.itertuples():
    annotations.append(annotation(row))

imagedf = data.drop_duplicates(subset=['fileid']).sort_values(by='fileid')
for row in imagedf.itertuples():
    images.append(image(row))

catdf = data.drop_duplicates(subset=['categoryid']).sort_values(by='categoryid')
for row in catdf.itertuples():
    categories.append(category(row))

data_coco = {}
data_coco["images"] = images
data_coco["categories"] = categories
data_coco["annotations"] = annotations

json.dump(data_coco, open(save_json_path, "w"), indent=4)
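A quick sanity check (a sketch, reusing the save_json_path defined above) is to reload the JSON and confirm the three top-level lists line up with the CSV: one image per unique filename, one annotation per row, and the categories plus the default 'None' entry:

import json

with open(save_json_path) as f:
    coco = json.load(f)
print(len(coco["images"]), len(coco["annotations"]), len(coco["categories"]))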

Error while trying to convert a dictionary to a pandas dataframe in Python

I got "Pandas ValueError Arrays Must be All Same Length"
Before I start, I checked answers to similar problems and folks suggest to use something like:
DataFrame(dict([ (k,Series(v)) for k,v in d.iteritems() ]))
if you have only two values in dictionary or,
a = {'Links' : lines ,'Titles' : titles , 'Singers': finalsingers , 'Albums':finalalbums , 'Years' : years}
df = pd.DataFrame.from_dict(a, orient='index')
df.transpose()
But neither of them worked for me. What my code does is, goes to file in directory, captures the name and last_modified time, opens the file and use it in function called phash and returns a value. I think there could be a problem with phash function, maybe sometimes it returns a null value.
So in my case data is something like this:
raw_data = {}
hash_11 = []
time_1 = []
file_name_1 = []

for file in date_file_list:
    try:
        #print(file[1])
        y = file[1]
        file_name = os.path.basename(y)  # extract just the filename, or #file_name = os.path.split(file[1])
        file_name = file_name.split('_-_')[0]
        file_name_1.append(file_name)
        #print(file_name)
        # convert date tuple to MM/DD/YYYY HH:MM:SS format
        #file_date = time.strftime("%m/%d/%y %H:%M:%S", file[0])
        time = file[0]
        time_1.append(time)
        img = Image.open(str(file[1]))
        hash_1 = imagehash.dhash(img)
        hash_11.append(hash_1)
        #hash_1 = str(hash_1)
        #data = [hash_1, time, file_name]
        #hamming_list.append(data)
        #print(file_name, hash_1, file_date)
        data = {'hash_1': hash_11, 'time': time_1, 'file_name': file_name_1}
        raw_data.update(data)
    except:
        pass

df = pd.DataFrame(raw_data, columns=['hash_1', 'time', 'file_name'])
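A likely cause (an assumption, since the failing input isn't shown): file_name_1 and time_1 are appended before the image is opened and hashed, so if Image.open or imagehash.dhash raises, the bare except: pass swallows the error and hash_11 ends up shorter than the other two lists, which is exactly what triggers the "arrays must all be same length" ValueError. A minimal sketch that only records a row once every step has succeeded:

for file in date_file_list:
    try:
        file_name = os.path.basename(file[1]).split('_-_')[0]
        img = Image.open(str(file[1]))
        hash_1 = imagehash.dhash(img)
    except Exception:
        continue  # skip this file entirely, so all three lists stay the same length
    file_name_1.append(file_name)
    time_1.append(file[0])
    hash_11.append(hash_1)

df = pd.DataFrame({'hash_1': hash_11, 'time': time_1, 'file_name': file_name_1})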
