Reverse Array in a dataframe

Reverse Array in a dataframe - python

Hi I am trying to extract data from a netCDF file, but the data is upside down. How can I reverse the database:
The data I want to extract is the height data from the (netcdf) at the points I have in the CSV file. my Data:
import numpy as np
from netCDF4 import Dataset
import matplotlib.pyplot as plt
import pandas as pd
from mpl_toolkits.basemap import Basemap
from matplotlib.patches import Path, PathPatch
csv_data = np.loadtxt('CSV with target coordinates',skiprows=1,delimiter=',')
num_el = csv_data[:,0]
lat = csv_data[:,1]
lon = csv_data[:,2]
value = csv_data[:,3]
data = Dataset("elevation Data",'r')
lon_range = data.variables['x_range'][:]
lat_range = data.variables['y_range'][:]
topo_range = data.variables['z_range'][:]
spacing = data.variables['spacing'][:]
dimension = data.variables['dimension'][:]
z = data.variables['z'][:]
lon_num = dimension[0]
lat_num = dimension[1]
etopo_lon = np.linspace(lon_range[0],lon_range[1],dimension[0])
etopo_lat = np.linspace(lat_range[0],lat_range[1],dimension[1])
topo = np.reshape(z, (lat_num, lon_num))
height = np.empty_like(num_el)
desired_lat_idx = np.empty_like(num_el)
desired_lon_idx = np.empty_like(num_el)
for i in range(len(num_el)):
tmp_lat = np.abs(etopo_lat - lat[i]).argmin()
tmp_lon = np.abs(etopo_lon - lon[i]).argmin()
desired_lat_idx[i] = tmp_lat
desired_lon_idx[i] = tmp_lon
height[i] = topo[tmp_lat,tmp_lon]
height[height<-10]=0
print(len(desired_lat_idx))
print(len(desired_lon_idx))
print(len(height))
dfl= pd.DataFrame({
'Latitude' : lat.reshape(-1),
'Longitude': lon.reshape(-1),
'Altitude': height.reshape(-1)
});
print(dfl)
# but the Lat should not be changed here (the dfl must be correct)
df =dfl
lat=np.array(df['Latitude'])
lon=np.array(df['Longitude'])
val=np.array(df['Altitude'])
m = basemap.Basemap(projection='robin', lon_0=0, lat_0=0, resolution='l',area_thresh=1000)
m.drawcoastlines(color = 'black')
x,y = m(lon,lat)
colormesh= m.contourf(x,y,val,100, tri=True, cmap = 'terrain')
plt.colorbar(location='bottom',pad=0.04,fraction=0.06)
plt.show()
I have already tried:
lat = csv_data[:,1]
lat= lat*(-1)
But this didn´t work

It's a plotting artifact().
Just do:
colormesh= m.contourf(x,y[::-1],val,100, tri=True, cmap = 'terrain')
y[::-1] will reverse the order of the y latitude elements (as opposed to the land-mass outlines; and while keeping the x longitude coordinates the same) and hence flip them.
I've often had this problem with plotting numpy image data in the past.
Your raw CSV data are unlikely to be flipped themselves (why would they be?). You should try sanity-checking them [I am not a domain expert I'm afraid]! Overlaying an actual coordinate grid can help with this.
Another way to do it is given here: Reverse Y-Axis in PyPlot
You could also therefore just do
ax = plt.gca()
ax.invert_yaxis()

Related

boxplot structure disappears when pandas contains nan [duplicate]

I am using matplotlib to plot a box figure but there are some missing values (NaN). Then I found it doesn't display the box figure within the columns having NaN values.
Do you know how to solve this problem?
Here are the codes.
import numpy as np
import matplotlib.pyplot as plt
#==============================================================================
# open data
#==============================================================================
filename='C:\\Users\\liren\\OneDrive\\Data\\DATA in the first field-final\\ks.csv'
AllData=np.genfromtxt(filename,delimiter=";",skip_header=0,dtype='str')
TreatmentCode = AllData[1:,0]
RepCode = AllData[1:,1]
KsData= AllData[1:,2:].astype('float')
DepthHeader = AllData[0,2:].astype('float')
TreatmentUnique = np.unique(TreatmentCode)[[3,1,4,2,8,6,9,7,0,5,10],]
nT = TreatmentUnique.size#nT=number of treatments
#nD=number of deepth;nR=numbers of replications;nT=number of treatments;iT=iterms of treatments
nD = 5
nR = 6
KsData_3D = np.zeros((nT,nD,nR))
for iT in range(nT):
Treatment = TreatmentUnique[iT]
TreatmentFilter = TreatmentCode == Treatment
KsData_Filtered = KsData[TreatmentFilter,:]
KsData_3D[iT,:,:] = KsData_Filtered.transpose()iD = 4
fig=plt.figure()
ax = fig.add_subplot(111)
plt.boxplot(KsData_3D[:,iD,:].transpose())
ax.set_xticks(range(1,nT+1))
ax.set_xticklabels(TreatmentUnique)
ax.set_title(DepthHeader[iD])
Here is the final figure and some of the treatments are missing in the box.

You can remove the NaNs from the data first, then plot the filtered data.
To do that, you can first find the NaNs using np.isnan(data), then perform the bitwise inversion of that Boolean array using the ~: bitwise inversion operator. Use that to index the data array, and you filter out the NaNs.
filtered_data = data[~np.isnan(data)]
In a complete example (adapted from here)
Tested in python 3.10, matplotlib 3.5.1, seaborn 0.11.2, numpy 1.21.5, pandas 1.4.2
For 1D data:
import matplotlib.pyplot as plt
import numpy as np
# fake up some data
np.random.seed(2022) # so the same data is created each time
spread = np.random.rand(50) * 100
center = np.ones(25) * 50
flier_high = np.random.rand(10) * 100 + 100
flier_low = np.random.rand(10) * -100
data = np.concatenate((spread, center, flier_high, flier_low), 0)
# Add a NaN
data[40] = np.NaN
# Filter data using np.isnan
filtered_data = data[~np.isnan(data)]
# basic plot
plt.boxplot(filtered_data)
plt.show()
For 2D data:
For 2D data, you can't simply use the mask above, since then each column of the data array would have a different length. Instead, we can create a list, with each item in the list being the filtered data for each column of the data array.
A list comprehension can do this in one line: [d[m] for d, m in zip(data.T, mask.T)]
import matplotlib.pyplot as plt
import numpy as np
# fake up some data
np.random.seed(2022) # so the same data is created each time
spread = np.random.rand(50) * 100
center = np.ones(25) * 50
flier_high = np.random.rand(10) * 100 + 100
flier_low = np.random.rand(10) * -100
data = np.concatenate((spread, center, flier_high, flier_low), 0)
data = np.column_stack((data, data * 2., data + 20.))
# Add a NaN
data[30, 0] = np.NaN
data[20, 1] = np.NaN
# Filter data using np.isnan
mask = ~np.isnan(data)
filtered_data = [d[m] for d, m in zip(data.T, mask.T)]
# basic plot
plt.boxplot(filtered_data)
plt.show()
I'll leave it as an exercise to the reader to extend this to 3 or more dimensions, but you get the idea.
Use seaborn, which is a high-level API for matplotlib
seaborn.boxplot filters NaN under the hood
import seaborn as sns
sns.boxplot(data=data)
1D
2D
NaN is also ignored if plotting from df.plot(kind='box') for pandas, which uses matplotlib as the default plotting backend.
import pandas as pd
df = pd.DataFrame(data)
df.plot(kind='box')
1D
2D

Masking a variable with lat and lon but needed 3d array

what i am trying is masking a value from nc file with numpy array, according to specific location but it gives me 1d array and i can not use this array for plotting here my code.
from netCDF4 import Dataset
import matplotlib.pyplot as plt
import numpy as np
import numpy.ma as ma
file = './sample_data/NSS.AMBX.NK.D08214.S0740.E0931.B5312324.WI.nc'
data = Dataset(file,mode='r')
fcdBT89gHz = np.asarray(data.groups['Data_Fields']['fcdr_brightness_temperature_1'][:])
fcdBT150gHz = np.asarray(data.groups['Data_Fields']['fcdr_brightness_temperature_2'][:])
fcdBT183_1gHz = np.asarray(data.groups['Data_Fields']['fcdr_brightness_temperature_3'][:])
fcdBT183_3gHz = np.asarray(data.groups['Data_Fields']['fcdr_brightness_temperature_4'][:])
fcdBT183_7gHz = np.asarray(data.groups['Data_Fields']['fcdr_brightness_temperature_5'][:])
lats = data.groups['Geolocation_Time_Fields']['latitude'] #Enlem degerleri
lons = data.groups['Geolocation_Time_Fields']['longitude'] #Boylam degerleri
latlar = np.asarray(lats[:]) # Lati
lonlar = np.asarray(lons[:]) # Long
lo = ma.masked_outside(lonlar,105,110)
la = ma.masked_outside(latlar,30,35)
merged_coord=~ma.mask_or(la.mask,lo.mask)
h = plt.plot(fcdBT150gHz[merged_coord])
The output is like that but i need latitudes in x axis like this plot
If you need shape of variables:
lo.shape = (2495, 90)
la.shape = (2495, 90)
fcdBT150gHz[merged_coord].shape = (701,)
Maybe i did not use true way for masking. If data is needed here.

Visualizing SOM and adding labels to the map

I have been trying to apply SOM on my dataframe, my dataframe has 25 columns where each column represents a house, each house has a values for power consumption for two years, and I want to cluster the data with number of clusters = 3.
I have done the following:
import sys
sys.path.insert(0, '../')
%load_ext autoreload
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pylab import plot,axis,show,pcolor,colorbar,bone
from matplotlib.patches import Patch
%matplotlib inline
from minisom import MiniSom
from sklearn.preprocessing import minmax_scale, scale
%autoreload 2
data1 = pd.read_excel(r"C:\Users\user\Desktop\Thesis\Tarek\Consumption.xlsx")
data1['h1'] = data1['h1'].str.split(';').str[2].astype('float')
data1['h2'] = data1['h2'].str.split(';').str[2].astype('float')
data1['h3'] = data1['h3'].str.split(';').str[2].astype('float')
data1['h4'] = data1['h4'].str.split(';').str[2].astype('float')
data1['h5'] = data1['h5'].str.split(';').str[2].astype('float')
data1['h6'] = data1['h6'].str.split(';').str[2].astype('float')
data1['h7'] = data1['h7'].str.split(';').str[2].astype('float')
data1['h8'] = data1['h8'].str.split(';').str[2].astype('float')
data1['h9'] = data1['h9'].str.split(';').str[2].astype('float')
data1['h10'] = data1['h10'].str.split(';').str[2].astype('float')
data1['h11'] = data1['h11'].str.split(';').str[2].astype('float')
data1['h12'] = data1['h12'].str.split(';').str[2].astype('float')
data1['h13'] = data1['h13'].str.split(';').str[2].astype('float')
data1['h14'] = data1['h14'].str.split(';').str[2].astype('float')
data1['h15'] = data1['h15'].str.split(';').str[2].astype('float')
data1['h16'] = data1['h16'].str.split(';').str[2].astype('float')
data1['h17'] = data1['h17'].str.split(';').str[2].astype('float')
data1['h18'] = data1['h18'].str.split(';').str[2].astype('float')
data1['h19'] = data1['h19'].str.split(';').str[2].astype('float')
data1['h20'] = data1['h20'].str.split(';').str[2].astype('float')
data1['h21'] = data1['h21'].str.split(';').str[2].astype('float')
data1['h22'] = data1['h22'].str.split(';').str[2].astype('float')
data1['h23'] = data1['h23'].str.split(';').str[2].astype('float')
data1['h24'] = data1['h24'].str.split(';').str[2].astype('float')
data1['h25'] = data1['h25'].str.split(';').str[2].astype('float')
data1.fillna(0,inplace=True)
data1=data1.round(decimals=2)
X=data1.values
som =MiniSom(x=3,y=3,input_len=25,sigma=1.0, learning_rate=0.5)
som.random_weights_init(X)
som.train_batch(data=X ,num_iteration=1000,verbose=True)
bone()
pcolor(som.distance_map().T)
colorbar()
markers = ['o' , 's','v']
colors = ['r', 'g','y']
for i, x in enumerate(X):
w = som.winner(x)
plot(w[0] + 0.5,
w[1] + 0.5,
markers[i],
markeredgecolor = colors[i],
markerfacecolor = 'None',
markersize = 10,
markeredgewidth = 2)
show()
when I am running the code, I am getting this error:
IndexError: list index out of range
please any tips to add the markers and colors in the right way without having any problems, and I would be glad if any one can help, I am a bit new to Python and tried to find a solution but I couldn`t find any.

The problem seems to be that the length of your X=data1.values is around 25 but the length of your markers and colors is only 3. So in the following for loop, when i is 3, you are trying to access markers[3] and colors[3] which throws an IndexError because both markers and colors goes up to index 2 (indexing starts from 0 in python)
for i, x in enumerate(X):
One solution is to define custom list of 25 markers and 25 colors. While you might want to define your own markers, you can leave the colors out and let the code choose automatic colors for the markeredgecolor

How to insert TXT data into netcdf in python

I'm new to python, so I'm sorry if I make any beginner mistakes. I'm trying to insert my text file into a netcdf.
I'm using the netcdf4 package and follow the example in this website: https://pyhogs.github.io/intro_netcdf4.html and I managed to reproduce the example (the example uses random data):
Problem: My text file contains: Lon, Lat , SST and when I try to insert this values, the netcdf file is created, however, it's not correct:
In my code I'm trying to apply a Barnes interpolation (var) or a griddata interpolation (interp).
I think this is what has to enter in my variable netcdf (maybe I'm wrong).
Here my code so far:
import os
import numpy as np
from scipy.interpolate import griddata
import matplotlib.pyplot as plt
import numpy.ma as ma
import netCDF4 as nc4
from numpy.random import uniform, seed
from metpy.interpolate import (interpolate_to_grid, remove_nan_observations, inverse_distance_to_grid, remove_repeat_coordinates)
# Open file
arq_sst = np.loadtxt(fname = "C:\\Users\\Rodrigo\\XYZ.txt", skiprows=0, delimiter=",")
# Getting the Arrays
lonf = arq_sst[:, 0]
latf = arq_sst[:, 1]
sstf = arq_sst[:, 2]
# Atmosphere level
z = [1]
#shapping grid
x_1, y_1 = np.meshgrid(lonf, latf)
#Barnes Interpolation
var = inverse_distance_to_grid(lonf, latf, sstf, x_1, y_1, r=100000, gamma=0.25, kappa=5.052, min_neighbors=3, kind='barnes')
#Or
#Another interpolation
interp = griddata((lonf, latf), sstf, (lonf[None,:], latf[:,None]), method='nearest')
#Open netcdf to write
f = nc4.Dataset('file_created.nc','w', format='NETCDF4')
#Creating group in netcdf file
tempgrp = f.createGroup('SAT_DATA')
#Specifying dimensions
tempgrp.createDimension('lon', len(lonf))
tempgrp.createDimension('lat', len(latf))
tempgrp.createDimension('z', len(z))
tempgrp.createDimension('time', None)
#Building variables
longitude = tempgrp.createVariable('Longitude', 'f4', 'lon')
latitude = tempgrp.createVariable('Latitude', 'f4', 'lat')
levels = tempgrp.createVariable('Levels', 'i4', 'z')
sst = tempgrp.createVariable('sst', 'f4', ('time', 'lon', 'lat', 'z'))
time = tempgrp.createVariable('Time', 'i4', 'time')
#Passing data into variables
longitude[:] = lonf
latitude[:] = latf
levels[:] = z
sst[0,:,:,:] = var
#get time in days since Jan 01,01
from datetime import datetime
today = datetime.today()
time_num = today.toordinal()
time[0] = time_num
#Add global attributes
f.description = "XYZ dataset containing one group"
f.history = "Created " + today.strftime("%d/%m/%y")
#Add local attributes to variable instances
longitude.units = 'degrees east'
latitude.units = 'degrees north'
time.units = 'days since Jan 01, 0001'
sst.units = 'degrees'
levels.units = 'meters'
sst.warning = 'This data is not real!'
#Closing the dataset
f.close()
Here is my text data(Header: Longitude,Latitude,SST). I decreased the number of lines to fit here:
-42.1870,-22.9940,22.4844
-37.4000,-29.9700,20.2000
-37.4200,-29.9600,20.1000
-39.1800,-30.0000,20.5000
-39.2100,-30.0000,20.4000
-39.2300,-30.0000,20.4000
-39.2200,-29.9800,20.4000
-39.2300,-29.9900,20.4000
-39.2000,-29.9800,20.4000
-39.1900,-30.0000,20.5000
-39.2800,-29.9900,20.5000
-39.2700,-29.9900,20.4000
-39.3400,-29.9700,20.5000
-39.3300,-29.9600,20.4000
-39.3100,-29.9600,20.4000
-39.3600,-29.9700,20.6000
-39.3500,-29.9900,20.4000
-39.3900,-29.9900,20.4000
-38.4600,-30.0000,20.3000
-38.4900,-29.9800,20.7000
-37.4800,-29.8800,20.4000
-37.5000,-29.8600,20.3000
-37.4600,-29.8900,20.3000
-41.3800,-29.9900,20.0000
-41.4000,-29.9900,20.1000
-41.0400,-29.9300,20.1000
-41.0200,-29.9200,20.2000
-41.0600,-29.9300,20.1000
-41.1000,-29.9400,19.9000
-41.0900,-29.9600,19.9000
-41.1100,-29.9800,19.9000
-41.1100,-29.9600,20.0000
-41.1200,-29.9400,20.0000
-41.1400,-29.9400,20.0000
-41.1600,-29.9500,20.1000
-41.1700,-29.9500,20.1000
-41.1900,-29.9700,20.0000
-41.1900,-29.9500,20.1000
-40.6800,-29.9900,20.1000
-40.7400,-29.9600,20.1000
-40.7700,-29.9700,20.1000
-40.7800,-29.9700,20.1000
-40.7100,-29.9000,20.1000
-40.7600,-29.9100,20.1000
-40.7400,-29.9000,20.1000
-40.7200,-29.9000,20.2000
-40.7600,-29.9200,20.1000
-40.7500,-29.9400,20.1000
-40.7800,-29.9100,20.2000
-40.8000,-29.9100,20.2000
-40.8100,-29.9300,20.1000
-40.8200,-29.9200,20.2000
-40.7900,-29.9300,20.2000
-40.7900,-29.9500,20.1000
-40.7700,-29.9300,20.1000
-40.8400,-29.9600,20.2000
-40.8600,-29.9600,20.3000
-40.9000,-29.9100,20.1000
-40.9100,-29.9100,20.0000
-40.3900,-29.9400,20.0000
-40.3900,-29.9200,20.0000
-40.4100,-29.9200,20.0000
-40.4100,-29.9400,20.0000
-40.3800,-29.9000,20.0000
-40.3800,-29.9200,20.0000
-40.4000,-29.9000,20.1000
-40.3700,-29.9600,20.0000
-40.3600,-29.9700,20.0000
-40.3800,-29.9800,20.0000
-40.4200,-29.9000,20.0000
-40.4300,-29.9300,20.1000
-40.4500,-29.9300,20.1000
-40.4700,-29.9300,20.0000
-40.4400,-29.9100,20.0000
-40.4500,-29.9100,20.0000
-40.4700,-29.9100,20.0000
-40.5000,-29.9400,19.9000
-40.5300,-29.9200,20.1000
-40.5100,-29.9200,20.1000
-40.4900,-29.9400,19.9000
-40.4900,-29.9200,20.0000
-40.6200,-30.0000,20.2000
-40.6000,-30.0000,20.1000
-40.6800,-29.9900,20.1000
-40.4000,-29.8400,20.1000
-40.4800,-29.8700,20.1000
-40.4500,-29.8300,20.3000
-40.4600,-29.8900,20.1000
-40.4600,-29.8700,20.0000
-40.5000,-29.8800,20.3000
-40.4900,-29.9000,20.1000
-40.5100,-29.9000,20.3000
-40.5300,-29.9000,20.2000
-40.5600,-29.8500,20.3000
-40.5800,-29.8500,20.3000
-40.6300,-29.9000,19.9000
-40.7100,-29.9000,20.1000
-40.0500,-29.9600,20.3000
-40.1100,-29.9800,20.2000
-40.1100,-30.0000,20.2000
Can anybody help me?

So there are a couple of things. First of all, you are not providing the correct equally spaced dimensions for the interpolation and the resulting netCDF file. This is how I created the space for the meshgrid, (I chose a linear space of 100 but depending on what resolution you want your data you may want to change this to whatever suits your purpose):
spacing_x = np.linspace(np.min(lonf),np.max(lonf),100)
spacing_y = np.linspace(np.min(latf),np.max(latf),100)
x_1, y_1 = np.meshgrid(spacing_x, spacing_y)
Then doing the interpolation as follows:
#Barnes Interpolation
var = inverse_distance_to_grid(lonf, latf, sstf, x_1, y_1, r=100000, gamma=0.25, kappa=5.052, min_neighbors=3, kind='barnes')
#Or
#Another interpolation
interp = griddata((lonf, latf), sstf, (x_1, y_1), method='nearest')
Finally you will want to add the linear spaces as the latitude and longitude dimensions since the interpolated data is being broadcasted to them:
#Passing data into variables
longitude[:] = x_1[0]
latitude[:] = y_1[:,0]
Another note is that for Panoply or other software to show your data in a Geo2D format, you will want to name your lat lon dimensions the same as your variables. The full code is below:
import os
import numpy as np
from scipy.interpolate import griddata
import matplotlib.pyplot as plt
import numpy.ma as ma
import netCDF4 as nc4
from numpy.random import uniform, seed
from metpy.interpolate import (interpolate_to_grid, remove_nan_observations, inverse_distance_to_grid, remove_repeat_coordinates)
# Open file
arq_sst = np.loadtxt(fname = r"C:\Users\Rodrigo\XYZ.txt", skiprows=0, delimiter=",")
# Getting the Arrays
lonf = arq_sst[:, 0]
latf = arq_sst[:, 1]
sstf = arq_sst[:, 2]
# Atmosphere level
z = [1]
#shapping grid
spacing_x = np.linspace(np.min(lonf),np.max(lonf),100)
spacing_y = np.linspace(np.min(latf),np.max(latf),100)
x_1, y_1 = np.meshgrid(spacing_x, spacing_y)
#Barnes Interpolation
var = inverse_distance_to_grid(lonf, latf, sstf, x_1, y_1, r=100000, gamma=0.25, kappa=5.052, min_neighbors=3, kind='barnes')
#Or
#Another interpolation
interp = griddata((lonf, latf), sstf, (x_1, y_1), method='nearest')
#Open netcdf to write
f = nc4.Dataset('file_created.nc','w', format='NETCDF4')
#Creating group in netcdf file
tempgrp = f.createGroup('SAT_DATA')
#Specifying dimensions
tempgrp.createDimension('longitude', len(spacing_x))
tempgrp.createDimension('latitude', len(spacing_y))
tempgrp.createDimension('z', len(z))
tempgrp.createDimension('time', None)
#Building variables
longitude = tempgrp.createVariable('longitude', 'f8', 'longitude', fill_value=np.nan)
latitude = tempgrp.createVariable('latitude', 'f8', 'latitude', fill_value=np.nan)
levels = tempgrp.createVariable('z', 'i4', 'z')
sst = tempgrp.createVariable('sst', 'f8', ('time','longitude','latitude','z'), fill_value=np.nan)
time = tempgrp.createVariable('time', 'f8', 'time', fill_value=np.nan)
#Passing data into variables
longitude[:] = x_1[0]
latitude[:] = y_1[:,0]
levels[:] = z
sst[0,:,:,:] = var
#get time in days since Jan 01,01
from datetime import datetime
today = datetime.today()
time_num = today.toordinal()
time[0] = time_num
#Add global attributes
f.description = "XYZ dataset containing one group"
f.history = "Created " + today.strftime("%d/%m/%y")
#Add local attributes to variable instances
longitude.units = 'degrees_east'
longitude.point_spacing = "even";
longitude._CoordinateAxisType = "Lon";
latitude.units = 'degrees_north'
latitude.point_spacing = "even";
latitude._CoordinateAxisType = "Lat";
time.units = "days since Jan 01, 0001";
time._ChunkSizes = [1]
sst.long_name = "SEA SURFACE TEMPERATURE"
sst.history = "From coads_climatology"
sst.units = "Deg C";
sst.missing_value = -1.0
sst._ChunkSizes = [1, 100, 100]
levels.units = 'meters'
sst.warning = 'This data is not real!'
#Closing the dataset
f.close()
Let me know if you have any questions.

world map without rivers with matplotlib / Basemap?

Would there be a way to plot the borders of the continents with Basemap (or without Basemap, if there is some other way), without those annoying rivers coming along? Especially that piece of Kongo River, not even reaching the ocean, is disturbing.
EDIT: I intend to further plot data over the map, like in the Basemap gallery (and still have the borderlines of the continents drawn as black lines over the data, to give structure for the worldmap) so while the solution by Hooked below is nice, masterful even, it's not applicable for this purpose.
Image produced by:
from mpl_toolkits.basemap import Basemap
import matplotlib.pyplot as plt
fig = plt.figure(figsize=(8, 4.5))
plt.subplots_adjust(left=0.02, right=0.98, top=0.98, bottom=0.00)
m = Basemap(projection='robin',lon_0=0,resolution='c')
m.fillcontinents(color='gray',lake_color='white')
m.drawcoastlines()
plt.savefig('world.png',dpi=75)

For reasons like this i often avoid Basemap alltogether and read the shapefile in with OGR and convert them to a Matplotlib artist myself. Which is alot more work but also gives alot more flexibility.
Basemap has some very neat features like converting the coordinates of input data to your 'working projection'.
If you want to stick with Basemap, get a shapefile which doesnt contain the rivers. Natural Earth for example has a nice 'Land' shapefile in the physical section (download 'scale rank' data and uncompress). See http://www.naturalearthdata.com/downloads/10m-physical-vectors/
You can read the shapefile in with the m.readshapefile() method from Basemap. This allows you to get the Matplotlib Path vertices and codes in the projection coordinates which you can then convert into a new Path. Its a bit of a detour but it gives you all styling options from Matplotlib, most of which are not directly available via Basemap. Its a bit hackish, but i dont now another way while sticking to Basemap.
So:
from mpl_toolkits.basemap import Basemap
import matplotlib.pyplot as plt
from matplotlib.collections import PathCollection
from matplotlib.path import Path
fig = plt.figure(figsize=(8, 4.5))
plt.subplots_adjust(left=0.02, right=0.98, top=0.98, bottom=0.00)
# MPL searches for ne_10m_land.shp in the directory 'D:\\ne_10m_land'
m = Basemap(projection='robin',lon_0=0,resolution='c')
shp_info = m.readshapefile('D:\\ne_10m_land', 'scalerank', drawbounds=True)
ax = plt.gca()
ax.cla()
paths = []
for line in shp_info[4]._paths:
paths.append(Path(line.vertices, codes=line.codes))
coll = PathCollection(paths, linewidths=0, facecolors='grey', zorder=2)
m = Basemap(projection='robin',lon_0=0,resolution='c')
# drawing something seems necessary to 'initiate' the map properly
m.drawcoastlines(color='white', zorder=0)
ax = plt.gca()
ax.add_collection(coll)
plt.savefig('world.png',dpi=75)
Gives:

How to remove "annoying" rivers:
If you want to post-process the image (instead of working with Basemap directly) you can remove bodies of water that don't connect to the ocean:
import pylab as plt
A = plt.imread("world.png")
import numpy as np
import scipy.ndimage as nd
import collections
# Get a counter of the greyscale colors
a = A[:,:,0]
colors = collections.Counter(a.ravel())
outside_and_water_color, land_color = colors.most_common(2)
# Find the contigous landmass
land_idx = a == land_color[0]
# Index these land masses
L = np.zeros(a.shape,dtype=int)
L[land_idx] = 1
L,mass_count = nd.measurements.label(L)
# Loop over the land masses and fill the "holes"
# (rivers without outlays)
L2 = np.zeros(a.shape,dtype=int)
L2[land_idx] = 1
L2 = nd.morphology.binary_fill_holes(L2)
# Remap onto original image
new_land = L2==1
A2 = A.copy()
c = [land_color[0],]*3 + [1,]
A2[new_land] = land_color[0]
# Plot results
plt.subplot(221)
plt.imshow(A)
plt.axis('off')
plt.subplot(222)
plt.axis('off')
B = A.copy()
B[land_idx] = [1,0,0,1]
plt.imshow(B)
plt.subplot(223)
L = L.astype(float)
L[L==0] = None
plt.axis('off')
plt.imshow(L)
plt.subplot(224)
plt.axis('off')
plt.imshow(A2)
plt.tight_layout() # Only with newer matplotlib
plt.show()
The first image is the original, the second identifies the land mass. The third is not needed but fun as it ID's each separate contiguous landmass. The fourth picture is what you want, the image with the "rivers" removed.

Following user1868739's example, I am able to select only the paths (for some lakes) that I want:
from mpl_toolkits.basemap import Basemap
import matplotlib.pyplot as plt
fig = plt.figure(figsize=(8, 4.5))
plt.subplots_adjust(left=0.02, right=0.98, top=0.98, bottom=0.00)
m = Basemap(resolution='c',projection='robin',lon_0=0)
m.fillcontinents(color='white',lake_color='white',zorder=2)
coasts = m.drawcoastlines(zorder=1,color='white',linewidth=0)
coasts_paths = coasts.get_paths()
ipolygons = range(83) + [84] # want Baikal, but not Tanganyika
# 80 = Superior+Michigan+Huron, 81 = Victoria, 82 = Aral, 83 = Tanganyika,
# 84 = Baikal, 85 = Great Bear, 86 = Great Slave, 87 = Nyasa, 88 = Erie
# 89 = Winnipeg, 90 = Ontario
for ipoly in ipolygons:
r = coasts_paths[ipoly]
# Convert into lon/lat vertices
polygon_vertices = [(vertex[0],vertex[1]) for (vertex,code) in
r.iter_segments(simplify=False)]
px = [polygon_vertices[i][0] for i in xrange(len(polygon_vertices))]
py = [polygon_vertices[i][2] for i in xrange(len(polygon_vertices))]
m.plot(px,py,linewidth=0.5,zorder=3,color='black')
plt.savefig('world2.png',dpi=100)
But this only works when using white background for the continents. If I change color to 'gray' in the following line, we see that other rivers and lakes are not filled with the same color as the continents are. (Also playing with area_thresh will not remove those rivers that are connected to ocean.)
m.fillcontinents(color='gray',lake_color='white',zorder=2)
The version with white background is adequate for further color-plotting all kind of land information over the continents, but a more elaborate solution would be needed, if one wants to retain the gray background for continents.

I frequently modify Basemap's drawcoastlines() to avoid those 'broken' rivers. I also modify drawcountries() for the sake of data source consistency.
Here is what I use in order to support the different resolutions available in Natural Earth data:
from mpl_toolkits.basemap import Basemap
class Basemap(Basemap):
""" Modify Basemap to use Natural Earth data instead of GSHHG data """
def drawcoastlines(self):
shapefile = 'data/naturalearth/coastline/ne_%sm_coastline' % \
{'l':110, 'm':50, 'h':10}[self.resolution]
self.readshapefile(shapefile, 'coastline', linewidth=1.)
def drawcountries(self):
shapefile = 'data/naturalearth/countries/ne_%sm_admin_0_countries' % \
{'l':110, 'm':50, 'h':10}[self.resolution]
self.readshapefile(shapefile, 'countries', linewidth=0.5)
m = Basemap(llcrnrlon=-90, llcrnrlat=-40, urcrnrlon=-30, urcrnrlat=+20,
resolution='l') # resolution = (l)ow | (m)edium | (h)igh
m.drawcoastlines()
m.drawcountries()
Here is the output:
Please note that by default Basemap uses resolution='c' (crude), which is not supported in the code shown.

If you're OK with plotting outlines rather than shapefiles, it's pretty easy to plot coastlines that you can get from wherever. I got my coastlines from the NOAA Coastline Extractor in MATLAB format:
http://www.ngdc.noaa.gov/mgg/shorelines/shorelines.html
To edit the coastlines, I converted to SVG, then edited them with Inkscape, then converted back to the lat/lon text file ("MATLAB" format).
All Python code is included below.
# ---------------------------------------------------------------
def plot_lines(mymap, lons, lats, **kwargs) :
"""Plots a custom coastline. This plots simple lines, not
ArcInfo-style SHAPE files.
Args:
lons: Longitude coordinates for line segments (degrees E)
lats: Latitude coordinates for line segments (degrees N)
Type Info:
len(lons) == len(lats)
A NaN in lons and lats signifies a new line segment.
See:
giss.noaa.drawcoastline_file()
"""
# Project onto the map
x, y = mymap(lons, lats)
# BUG workaround: Basemap projects our NaN's to 1e30.
x[x==1e30] = np.nan
y[y==1e30] = np.nan
# Plot projected line segments.
mymap.plot(x, y, **kwargs)
# Read "Matlab" format files from NOAA Coastline Extractor.
# See: http://www.ngdc.noaa.gov/mgg/coast/
lineRE=re.compile('(.*?)\s+(.*)')
def read_coastline(fname, take_every=1) :
nlines = 0
xdata = array.array('d')
ydata = array.array('d')
for line in file(fname) :
# if (nlines % 10000 == 0) :
# print 'nlines = %d' % (nlines,)
if (nlines % take_every == 0 or line[0:3] == 'nan') :
match = lineRE.match(line)
lon = float(match.group(1))
lat = float(match.group(2))
xdata.append(lon)
ydata.append(lat)
nlines = nlines + 1
return (np.array(xdata),np.array(ydata))
def drawcoastline_file(mymap, fname, **kwargs) :
"""Reads and plots a coastline file.
See:
giss.basemap.drawcoastline()
giss.basemap.read_coastline()
"""
lons, lats = read_coastline(fname, take_every=1)
return drawcoastline(mymap, lons, lats, **kwargs)
# =========================================================
# coastline2svg.py
#
import giss.io.noaa
import os
import numpy as np
import sys
svg_header = """<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!-- Created with Inkscape (http://www.inkscape.org/) -->
<svg
xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns:cc="http://creativecommons.org/ns#"
xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
xmlns:svg="http://www.w3.org/2000/svg"
xmlns="http://www.w3.org/2000/svg"
version="1.1"
width="360"
height="180"
id="svg2">
<defs
id="defs4" />
<metadata
id="metadata7">
<rdf:RDF>
<cc:Work
rdf:about="">
<dc:format>image/svg+xml</dc:format>
<dc:type
rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
<dc:title></dc:title>
</cc:Work>
</rdf:RDF>
</metadata>
<g
id="layer1">
"""
path_tpl = """
<path
d="%PATH%"
id="%PATH_ID%"
style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
"""
svg_footer = "</g></svg>"
# Set up paths
data_root = os.path.join(os.environ['HOME'], 'data')
#modelerc = giss.modele.read_modelerc()
#cmrun = modelerc['CMRUNDIR']
#savedisk = modelerc['SAVEDISK']
ifname = sys.argv[1]
ofname = ifname.replace('.dat', '.svg')
lons, lats = giss.io.noaa.read_coastline(ifname, 1)
out = open(ofname, 'w')
out.write(svg_header)
path_id = 1
points = []
for lon, lat in zip(lons, lats) :
if np.isnan(lon) or np.isnan(lat) :
# Process what we have
if len(points) > 2 :
out.write('\n<path d="')
out.write('m %f,%f L' % (points[0][0], points[0][1]))
for pt in points[1:] :
out.write(' %f,%f' % pt)
out.write('"\n id="path%d"\n' % (path_id))
# out.write('style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"')
out.write(' />\n')
path_id += 1
points = []
else :
lon += 180
lat = 180 - (lat + 90)
points.append((lon, lat))
out.write(svg_footer)
out.close()
# =============================================================
# svg2coastline.py
import os
import sys
import re
# Reads the output of Inkscape's "Plain SVG" format, outputs in NOAA MATLAB coastline format
mainRE = re.compile(r'\s*d=".*"')
lineRE = re.compile(r'\s*d="(m|M)\s*(.*?)"')
fname = sys.argv[1]
lons = []
lats = []
for line in open(fname, 'r') :
# Weed out extraneous lines in the SVG file
match = mainRE.match(line)
if match is None :
continue
match = lineRE.match(line)
# Stop if something is wrong
if match is None :
sys.stderr.write(line)
sys.exit(-1)
type = match.group(1)[0]
spairs = match.group(2).split(' ')
x = 0
y = 0
for spair in spairs :
if spair == 'L' :
type = 'M'
continue
(sdelx, sdely) = spair.split(',')
delx = float(sdelx)
dely = float(sdely)
if type == 'm' :
x += delx
y += dely
else :
x = delx
y = dely
lon = x - 180
lat = 90 - y
print '%f\t%f' % (lon, lat)
print 'nan\tnan'

Okay I think I have a partial solution.
The basic idea is that the paths used by drawcoastlines() are ordered by the size/area. Which means the first N paths are (for most applications) the main land masses and lakes and the later paths the smaller islands and rivers.
The issue is that the first N paths that you want will depend on the projection (e.g., global, polar, regional), if area_thresh has been applied and whether you want lakes or small islands etc. In other words, you will have to tweak this per application.
from mpl_toolkits.basemap import Basemap
import matplotlib.pyplot as plt
mp = 'cyl'
m = Basemap(resolution='c',projection=mp,lon_0=0,area_thresh=200000)
fill_color = '0.9'
# If you don't want lakes set lake_color to fill_color
m.fillcontinents(color=fill_color,lake_color='white')
# Draw the coastlines, with a thin line and same color as the continent fill.
coasts = m.drawcoastlines(zorder=100,color=fill_color,linewidth=0.5)
# Exact the paths from coasts
coasts_paths = coasts.get_paths()
# In order to see which paths you want to retain or discard you'll need to plot them one
# at a time noting those that you want etc.
for ipoly in xrange(len(coasts_paths)):
print ipoly
r = coasts_paths[ipoly]
# Convert into lon/lat vertices
polygon_vertices = [(vertex[0],vertex[1]) for (vertex,code) in
r.iter_segments(simplify=False)]
px = [polygon_vertices[i][0] for i in xrange(len(polygon_vertices))]
py = [polygon_vertices[i][1] for i in xrange(len(polygon_vertices))]
m.plot(px,py,'k-',linewidth=1)
plt.show()
Once you know the relevant ipoly to stop drawing (poly_stop) then you can do something like this...
from mpl_toolkits.basemap import Basemap
import matplotlib.pyplot as plt
mproj = ['nplaea','cyl']
mp = mproj[0]
if mp == 'nplaea':
m = Basemap(resolution='c',projection=mp,lon_0=0,boundinglat=30,area_thresh=200000,round=1)
poly_stop = 10
else:
m = Basemap(resolution='c',projection=mp,lon_0=0,area_thresh=200000)
poly_stop = 18
fill_color = '0.9'
# If you don't want lakes set lake_color to fill_color
m.fillcontinents(color=fill_color,lake_color='white')
# Draw the coastlines, with a thin line and same color as the continent fill.
coasts = m.drawcoastlines(zorder=100,color=fill_color,linewidth=0.5)
# Exact the paths from coasts
coasts_paths = coasts.get_paths()
# In order to see which paths you want to retain or discard you'll need to plot them one
# at a time noting those that you want etc.
for ipoly in xrange(len(coasts_paths)):
if ipoly > poly_stop: continue
r = coasts_paths[ipoly]
# Convert into lon/lat vertices
polygon_vertices = [(vertex[0],vertex[1]) for (vertex,code) in
r.iter_segments(simplify=False)]
px = [polygon_vertices[i][0] for i in xrange(len(polygon_vertices))]
py = [polygon_vertices[i][1] for i in xrange(len(polygon_vertices))]
m.plot(px,py,'k-',linewidth=1)
plt.show()

As per my comment to #sampo-smolander
from mpl_toolkits.basemap import Basemap
import matplotlib.pyplot as plt
fig = plt.figure(figsize=(8, 4.5))
plt.subplots_adjust(left=0.02, right=0.98, top=0.98, bottom=0.00)
m = Basemap(resolution='c',projection='robin',lon_0=0)
m.fillcontinents(color='gray',lake_color='white',zorder=2)
coasts = m.drawcoastlines(zorder=1,color='white',linewidth=0)
coasts_paths = coasts.get_paths()
ipolygons = range(83) + [84]
for ipoly in xrange(len(coasts_paths)):
r = coasts_paths[ipoly]
# Convert into lon/lat vertices
polygon_vertices = [(vertex[0],vertex[1]) for (vertex,code) in
r.iter_segments(simplify=False)]
px = [polygon_vertices[i][0] for i in xrange(len(polygon_vertices))]
py = [polygon_vertices[i][1] for i in xrange(len(polygon_vertices))]
if ipoly in ipolygons:
m.plot(px,py,linewidth=0.5,zorder=3,color='black')
else:
m.plot(px,py,linewidth=0.5,zorder=4,color='grey')
plt.savefig('world2.png',dpi=100)

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

Reverse Array in a dataframe - python

Related

boxplot structure disappears when pandas contains nan [duplicate]

Masking a variable with lat and lon but needed 3d array

Visualizing SOM and adding labels to the map

How to insert TXT data into netcdf in python

world map without rivers with matplotlib / Basemap?

Categories

Resources