How to write datetime information to netcdf4?

Essentially, I would like to open a netcdf file, read out the time stamps for individual pixels and then write the timestamps into a new file. Here is my pseudo-code:
f10 = Dataset(nc_f10, 'r')
Time_UTC_10 = np.transpose(f10.variables['TIME_UTC'][:])  # shape is [92,104]
radiance_10 = f10.variables['RADIANCE'][:]  # shape is [92,104]
f10.close()

# Manipulate radiance information
# python separates the characters in the timestamp, so join them back up:
for i in np.arange(92):
    for j in np.arange(104):
        joined_10 = ''.join(Time_UTC_10[:, i, j])
        datetime_10[i, j] = datetime.datetime.strptime(joined_10, '%Y-%m-%dT%H:%M:%S.%fZ')

# Create and fill the netcdf
nc_out = Dataset(output_directory + nc_out_file, 'w', format='NETCDF4')
y = nc_out.createDimension('y', 104)
x = nc_out.createDimension('x', 92)
times = nc_out.createVariable('time', np.unicode_, ('x', 'y'))
O5s = nc_out.createVariable('O5s', np.float32, ('x', 'y'))
times[:] = datetime_10
O5s[:] = radiance_10
nc_out.close()
But when I try to run this, I get the following error:
TypeError: only numpy string, unicode or object arrays can be assigned to VLEN str var slices
I feel like I may be misunderstanding something important here. Any thoughts on how I can correct this code to write the timestamps to a variable in a netcdf?

I really do not know why you want to keep your time variable as strings (this is what the error message says: the assigned values can only be string, unicode, or object arrays), but one example is like this:
#!/usr/bin/env ipython
# ----------------------
import numpy as np
from netCDF4 import Dataset,num2date,date2num
# ----------------------
ny=104;
nx=92
# ----------------------
radiance_10=np.random.random((nx,ny));  # shape (92,104), matching dims ('x','y')
datetime_16=np.ones((nx,ny))
# ----------------------
nc_out = Dataset('test.nc', 'w', format='NETCDF4')
y = nc_out.createDimension('y',ny)
x = nc_out.createDimension('x',nx)
times = nc_out.createVariable('time', np.unicode_, ('x','y'))
O5s = nc_out.createVariable('O5s', np.float32, ('x', 'y'))
O5s[:] = radiance_10
for ii in range(ny):
    for jj in range(nx):
        times[jj,ii] = "2011-01-01 00:00:00"
nc_out.close()
Basically, the values written to the time variable are now strings, with the value "2011-01-01 00:00:00" at every grid point.
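Applied to the question's arrays, that means converting the datetime objects to strings before assigning them to the VLEN string variable; a minimal sketch (assuming the datetime_10 array built in the question):
# format each datetime back into an ISO-style string;
# dtype=object is one of the types the VLEN str variable accepts
time_strings = np.array([[d.strftime('%Y-%m-%dT%H:%M:%S.%fZ') for d in row]
                         for row in datetime_10], dtype=object)
times[:] = time_strings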
Nevertheless, I would store time values as time elapsed since an arbitrarily selected reference time; that is the most common way to keep time in a netCDF file. Let us assume the data at every point is for the time 2014-04-11 23:59. Then I could save it as seconds since 2014-04-01. Here is the code I would use:
import numpy as np
from netCDF4 import Dataset,num2date,date2num
import datetime
# ----------------------
ny=104;
nx=92
# ----------------------
radiance_10=np.random.random((nx,ny));  # shape (92,104), matching dims ('x','y')
# ---------------------------------------------------
timevalue = datetime.datetime(2014,4,11,23,59)
time_unit_out= "seconds since 2014-04-01 00:00:00"
# ---------------------------------------------------
nc_out = Dataset('test_b.nc', 'w', format='NETCDF4')
y = nc_out.createDimension('y',ny)
x = nc_out.createDimension('x',nx)
times = nc_out.createVariable('time', np.float64, ('x','y'))
times.setncattr('units',time_unit_out);  # 'units' is the conventional attribute name
O5s = nc_out.createVariable('O5s', np.float32, ('x', 'y'))
O5s[:] = radiance_10
times[:] = date2num(timevalue,time_unit_out);
nc_out.close()
If you check the value that is now in the time variable, it is 950340, which is the number of seconds from 2014-04-01 00:00 to 2014-04-11 23:59.
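To get datetimes back out of such a file, num2date reverses the conversion; a minimal sketch, assuming the test_b.nc file written above:
from netCDF4 import Dataset, num2date

nc_in = Dataset('test_b.nc', 'r')
tvals = nc_in.variables['time'][:]
tunits = nc_in.variables['time'].getncattr('units')
# num2date turns numeric offsets back into datetime objects
dates = num2date(tvals, tunits)
print(dates[0, 0])  # 2014-04-11 23:59:00
nc_in.close()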

Related

Saving data in Python

I am trying to save the data in CSV format, with r in the first column and the two profiles from A in the next two columns (the current and desired outputs were shown as attached images).
import numpy as np
import csv
r = np.linspace(0, 100e-6, 5)
A=np.array([[23.9496871440374 - 1336167292.56833*r**2],
[21.986288555672 - 1373636804.80965*r**2]])
with open('Vel_Profiles.csv', 'w') as f:
    writer = csv.writer(f)
    writer.writerows((r, A))
Here is what worked for me to get your expected output:
import numpy as np
import csv
r = np.linspace(0, 100e-6, 5)
A=np.array([[23.9496871440374 - 1336167292.56833*r**2],
[21.986288555672 - 1373636804.80965*r**2]])
out = np.vstack([r,A.squeeze()]).T
np.savetxt('Vel_Profiles.csv', out, delimiter=',', fmt=['%2.2E', '%.5f', '%.6f'])
output:
0.00E+00,23.94969,21.986289
2.50E-05,23.11458,21.127766
5.00E-05,20.60927,18.552197
7.50E-05,16.43375,14.259582
1.00E-04,10.58801,8.249921
UPDATE
Specifying the format of all columns in a more general way, as asked in the comments:
r = np.linspace(0, 100e-6, 5)
A=np.array([[23.9496871440374 - 1336167292.56833*r**2],
[21.986288555672 - 1373636804.80965*r**2]])
out = np.vstack([r,A.squeeze()]).T
test = np.hstack([out,out,out])
print(test.shape)
# (5, 9)
# build a list of formats with the same length as shape[1]
# here we have the same three columns three times next to each other, so just repeat the format list 3 times
my_format = ['%2.2E', '%.5f', '%.6f']
my_list_of_formats = my_format*3
# ['%2.2E', '%.5f', '%.6f', '%2.2E', '%.5f', '%.6f', '%2.2E', '%.5f', '%.6f']
#or like this:
my_list_of_formats = [my_format[i % 3] for i in range(test.shape[1])]
# ['%2.2E', '%.5f', '%.6f', '%2.2E', '%.5f', '%.6f', '%2.2E', '%.5f', '%.6f']
np.savetxt('Vel_Profiles.csv', test, delimiter=',', fmt=my_list_of_formats)
You can also pass a single format string such as '%2.2E' to fmt=; then every column is formatted that way.
You don't need another library; you can use NumPy itself.
You can do this:
import numpy as np
np.savetxt('file_name.csv', your_array, delimiter=',')
If you need to stack your arrays first, you can do something like this (note that A from the question has an extra singleton dimension, hence the squeeze):
array = np.vstack([r, A.squeeze()])
Check out the documentation here:
savetxt: https://numpy.org/doc/stable/reference/generated/numpy.savetxt.html
vstack: https://numpy.org/doc/stable/reference/generated/numpy.vstack.html
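For this particular question, a self-contained version of that approach could look like the following sketch (the squeeze removes the singleton dimension in A, and .T puts r and the two profiles into columns):
import numpy as np

r = np.linspace(0, 100e-6, 5)
A = np.array([[23.9496871440374 - 1336167292.56833*r**2],
              [21.986288555672 - 1373636804.80965*r**2]])

# stack r with the two profiles as rows, then transpose to get 3 columns
out = np.vstack([r, A.squeeze()]).T  # shape (5, 3)
np.savetxt('Vel_Profiles.csv', out, delimiter=',')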

Python: Netcdf: Is there a method to get the overall average from one variable where another variable overlaps with a unique value?

I have a netcdf file with a 3D int32 variable called tag (shape time, lat, lon) and a 3D float64 variable called p (shape time, lat, lon). The shapes are identical for both variables. The integer values of the tag variable start at 0 and increase monotonically to an unknown ending value. The 0 value is not needed, so I would like to compute an overall (space-time) average of the p variable for each tag value from 1 through the largest tag value n.
Example (array space (time, lat, lon)): The first integer tag value is 1. This value occurs at, say, (0,45,45) and (1,45,46). The p values at those tag = 1 array spaces are, say, 2 and 4, so the averaged result should be 3. The next integer tag value is 2. This value occurs at, say, (2,100,99), (2,101,99), and (3,101,98), with p values at those array spaces equal to 3, 8, and 1, so the averaged result should be 4. The last integer value is n. This value occurs at, say, (360,200,100), (361,200,100), (361,201,100), and (361,202,100), with p values at those array spaces equal to 1, 1, 5, and 9, so the averaged result should be 4. When these are written to a text file, it should look like:
3
4
.
.
4
The python code below reads the netcdf files and variables:
import datetime as dt  # Python standard library datetime module
import numpy as np
from netCDF4 import Dataset  # http://code.google.com/p/netcdf4-python/

def ncdump(nc_fid, verb=True):
    '''
    ncdump outputs dimensions, variables and their attribute information.
    The information is similar to that of NCAR's ncdump utility.
    ncdump requires a valid instance of Dataset.

    Parameters
    ----------
    nc_fid : netCDF4.Dataset
        A netCDF4 dataset object
    verb : Boolean
        whether or not nc_attrs, nc_dims, and nc_vars are printed

    Returns
    -------
    nc_attrs : list
        A Python list of the NetCDF file global attributes
    nc_dims : list
        A Python list of the NetCDF file dimensions
    nc_vars : list
        A Python list of the NetCDF file variables
    '''
    def print_ncattr(key):
        """
        Prints the NetCDF file attributes for a given key

        Parameters
        ----------
        key : unicode
            a valid netCDF4.Dataset.variables key
        """
        try:
            print "\t\ttype:", repr(nc_fid.variables[key].dtype)
            for ncattr in nc_fid.variables[key].ncattrs():
                print '\t\t%s:' % ncattr,\
                    repr(nc_fid.variables[key].getncattr(ncattr))
        except KeyError:
            print "\t\tWARNING: %s does not contain variable attributes" % key

    # NetCDF global attributes
    nc_attrs = nc_fid.ncattrs()
    if verb:
        print "NetCDF Global Attributes:"
        for nc_attr in nc_attrs:
            print '\t%s:' % nc_attr, repr(nc_fid.getncattr(nc_attr))
    nc_dims = [dim for dim in nc_fid.dimensions]  # list of nc dimensions
    # Dimension shape information.
    if verb:
        print "NetCDF dimension information:"
        for dim in nc_dims:
            print "\tName:", dim
            print "\t\tsize:", len(nc_fid.dimensions[dim])
            print_ncattr(dim)
    # Variable information.
    nc_vars = [var for var in nc_fid.variables]  # list of nc variables
    if verb:
        print "NetCDF variable information:"
        for var in nc_vars:
            if var not in nc_dims:
                print '\tName:', var
                print "\t\tdimensions:", nc_fid.variables[var].dimensions
                print "\t\tsize:", nc_fid.variables[var].size
                print_ncattr(var)
    return nc_attrs, nc_dims, nc_vars

nc_f = './tag.nc'  # Your filename
nc_fid = Dataset(nc_f, 'r')  # Dataset is the class behavior to open the file
                             # and create an instance of the ncCDF4 class
nc_attrs, nc_dims, nc_vars = ncdump(nc_fid)
# Extract data from NetCDF file
lats = nc_fid.variables['lat'][:]  # extract/copy the data
lons = nc_fid.variables['lon'][:]
time = nc_fid.variables['time'][:]
tag = nc_fid.variables['tag'][:]  # shape is time, lat, lon as shown above

nc_p = '../p/p.nc'  # Your filename
nc_fid = Dataset(nc_p, 'r')  # Dataset is the class behavior to open the file
                             # and create an instance of the ncCDF4 class
nc_attrs, nc_dims, nc_vars = ncdump(nc_fid)
p = nc_fid.variables['p'][:]  # shape is time, lat, lon as shown above
This code returns:
NetCDF Global Attributes:
NetCDF dimension information:
    Name: time
        size: 365
        type: dtype('float64')
        axis: u'T'
        calendar: u'standard'
        standard_name: u'time'
        units: u'hours since 1800-01-01 00:00'
    Name: lat
        size: 287
        type: dtype('float64')
        long_name: u'latitude'
        units: u'degrees_north'
        standard_name: u'latitude'
        axis: u'Y'
    Name: lon
        size: 612
        type: dtype('float64')
        long_name: u'longitude'
        units: u'degrees_east'
        standard_name: u'longitude'
        axis: u'X'
NetCDF variable information:
    Name: tag
        dimensions: (u'time', u'lat', u'lon')
        size: 64110060
        type: dtype('int32')
I have been playing around with pandas groupby function, but I have not found something that works for my example yet.
I found a solution that works quickly. Checking the results, they are correct.
Using xarray to open the data, I then converted the data to a dataframe. After that I could use pandas groupby to do the computation.
from pylab import *
import numpy as np
import pandas as pd
import xarray as xr
import netCDF4
# Open data with xarray
dt = xr.open_mfdataset(['../tag.nc', '../p/p.nc'], combine='by_coords')
# Convert to data frame
dtdf = dt.to_dataframe()
dm = {'p': ['mean']}
mean = dtdf.groupby('tag').agg(dm)
mean.columns = ['_'.join(col) for col in mean.columns.values]
p_mean = mean.loc[1:, 'p_mean']
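The question asked for the averages in a text file, one value per line; a small sketch to finish the job (tag_means.txt is a hypothetical output name):
import numpy as np

# p_mean is a pandas Series indexed by tag; write one mean per line
np.savetxt('tag_means.txt', p_mean.values, fmt='%g')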

converting a 1d array to netcdf

I have a 1d array, an hourly time series of 49090 points, which needs to be converted to netCDF format.
In the code below, result_u2 is a 1d array which stores result from a for loop. It has 49090 datapoints.
nhours = 49091;#one added to no of datapoints
unout.units = 'hours since 2012-10-20 00:00:00'
unout.calendar = 'gregorian'
ncout = Dataset('output.nc','w','NETCDF3');
ncout.createDimension('time',nhours);
datesout = [datetime.datetime(2012,10,20,0,0,0)+n*timedelta(hours=1) for n in range(nhours)]; # create datevalues
timevar = ncout.createVariable('time','float64',('time'));timevar.setncattr('units',unout);timevar[:]=date2num(datesout,unout);
winds = ncout.createVariable('winds','float32',('time',));winds.setncattr('units','m/s');winds[:] = result_u2;
ncout.close()
I'm new to programming. The code I tried above should be able to write the nc file but while running the script no nc file is being created. Please help.
My suggestion would be to have a look at Python syntax in general if you want to use it and the netCDF4 package. E.g., idiomatic Python code does not use semicolons.
Check out the API documentation - the tutorial you find there basically covers what you're asking. Then, your code could look like
import datetime
import netCDF4

# using "with" syntax so you don't have to do the cleanup:
with netCDF4.Dataset('output.nc', 'w', format='NETCDF3_CLASSIC') as ncout:
    # create time dimension
    nhours = 49091
    time = ncout.createDimension('time', nhours)
    # create the time variable
    times = ncout.createVariable('time', 'f8', ('time',))
    times.units = 'hours since 2012-10-20 00:00:00'
    times.calendar = 'gregorian'
    # fill time
    dates = [datetime.datetime(2012, 10, 20, 0, 0, 0) + n*datetime.timedelta(hours=1) for n in range(nhours)]
    times[:] = netCDF4.date2num(dates, units=times.units, calendar=times.calendar)
    # create variable 'wind', dependent on time
    wind = ncout.createVariable('wind', 'f8', ('time',))
    wind.units = 'm/s'
    # fill with data, using your 1d array here:
    wind[:] = result_u2
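Note that the snippet assumes result_u2 already holds your hourly values and that its length matches the time dimension (the question mentions 49090 points but sizes the dimension to 49091; the two must agree). For a self-contained test you could substitute dummy data before the with block:
import numpy as np

# hypothetical stand-in for the asker's result array, just to make the sketch run
result_u2 = np.random.random(49091).astype('float32')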

Spatial temporal query in python with many records

I have a dataframe of 600,000 x/y points with date-time information, along with another field, 'status', containing extra descriptive information.
My objective is, for each record:
sum the 'status' column over records that fall within a certain spatio-temporal buffer
the specific buffer is within t - 8 hours and < 100 meters
Currently I have the data in a pandas data frame.
I could loop through the rows and, for each record, subset the dates of interest, then calculate distances and restrict the selection further. However, that would still be quite slow with so many records.
THIS TAKES 4.4 hours to run.
I can see that I could create a 3 dimensional kdtree with x, y, date as epoch time. However, I am not certain how to restrict the distances properly when incorporating dates and geographic distances.
Here is some reproducible code for you guys to test on:
Import
import numpy.random as npr
import numpy as np
import pandas as pd
from pandas import DataFrame, date_range
from datetime import datetime, timedelta
Create data
np.random.seed(111)
Function to generate test data
def CreateDataSet(Number=1):
    Output = []
    for i in range(Number):
        # Create a date range with hour frequency
        date = date_range(start='10/1/2012', end='10/31/2012', freq='H')
        # Create long lat data
        laty = npr.normal(4815862, 5000, size=len(date))
        longx = npr.normal(687993, 5000, size=len(date))
        # status of interest
        status = [0, 1]
        # Make a random list of statuses
        random_status = [status[npr.randint(low=0, high=len(status))] for i in range(len(date))]
        # user pool
        user = ['sally', 'derik', 'james', 'bob', 'ryan', 'chris']
        # Make a random list of users
        random_user = [user[npr.randint(low=0, high=len(user))] for i in range(len(date))]
        Output.extend(zip(random_user, random_status, date, longx, laty))
    return pd.DataFrame(Output, columns=['user', 'status', 'date', 'long', 'lat'])
#Create data
data = CreateDataSet(3)
len(data)
#some time deltas
before = timedelta(hours = 8)
after = timedelta(minutes = 1)
Function to speed up
def work(df):
    output = []
    # loop through data indexes
    for i in range(0, len(df)):
        l = []
        # first we will filter the data by date to have a smaller list to compute distances for
        # create a mask to query all dates between range for date i
        date_mask = (df['date'] >= df['date'].iloc[i]-before) & (df['date'] <= df['date'].iloc[i]+after)
        # create a mask to query all users who are not user i (themselves)
        user_mask = df['user'] != df['user'].iloc[i]
        # apply masks
        dists_to_check = df[date_mask & user_mask]
        # for point i, create coordinate to calculate distances from
        a = np.array((df['long'].iloc[i], df['lat'].iloc[i]))
        # create array of distances to check on the masked data
        b = np.array((dists_to_check['long'].values, dists_to_check['lat'].values))
        # for j in the date queried data
        for j in range(1, len(dists_to_check)):
            # compute the euclidean distance between point a and each point of b (the date masked data)
            x = np.linalg.norm(a - np.array((b[0][j], b[1][j])))
            # if the distance is within our range of interest append the index to a list
            if x <= 100:
                l.append(j)
            else:
                pass
        try:
            # use the list of desired indexes 'l' to query a final subset of the data
            data = dists_to_check.iloc[l]
            # summarize the column of interest then append to output list
            output.append(data['status'].sum())
        except IndexError:
            output.append(0)
            # print("There were no data to add")
    return pd.DataFrame(output)
Run code and time it
start = datetime.now()
out = work(data)
print(datetime.now() - start)
Is there a way to do this query in a vectorized way, or should I be chasing another technique?
<3
Here is what at least somewhat solves my problem. Since the loop can operate on different parts of the data independently, parallelization makes sense here.
Using IPython.parallel:
from IPython.parallel import Client
cli = Client()
cli.ids

cli = Client()
dview = cli[:]

with dview.sync_imports():
    import numpy as np
    import os
    from datetime import timedelta
    import pandas as pd

# We also need to add the time deltas and the output list into the function as
# local variables, as well as add the IPython.parallel decorator
@dview.parallel(block=True)
def work(df):
    before = timedelta(hours=8)
    after = timedelta(minutes=1)
    output = []
    # ... the rest of the loop body is the same as in work() above ...
Final time: 1:17:54.910206, about 1/4 of the original time.
I would still be very interested for anyone to suggest small speed improvements within the body of the function.
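One further idea for a speed-up, offered only as a sketch (not benchmarked; work_kdtree and radius are illustrative names): build a scipy.spatial.cKDTree on the x/y coordinates once, fetch all neighbours within 100 m in a single query, and only then apply the time and user masks to that (usually small) candidate set.
from datetime import timedelta

import numpy as np
import pandas as pd
from scipy.spatial import cKDTree

def work_kdtree(df, radius=100):
    before = timedelta(hours=8)
    after = timedelta(minutes=1)
    # one spatial index over all points; query_ball_point returns, for each
    # point, the indexes of every point within `radius` metres
    xy = df[['long', 'lat']].values
    tree = cKDTree(xy)
    neighbours = tree.query_ball_point(xy, r=radius)
    dates = df['date'].values
    users = df['user'].values
    status = df['status'].values
    output = np.zeros(len(df))
    for i, idx in enumerate(neighbours):
        idx = np.asarray(idx, dtype=int)
        # temporal buffer and "not the same user" rule, applied to the
        # small spatial candidate set instead of the whole frame
        mask = ((dates[idx] >= dates[i] - before) &
                (dates[idx] <= dates[i] + after) &
                (users[idx] != users[i]))
        output[i] = status[idx[mask]].sum()
    return pd.DataFrame(output)

out = work_kdtree(data)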

Python, parsing data 24 hours at a time out of 263 days

I have an Excel file (to be converted to CSV).
The data has 8 columns. The first two are day of the year and time, respectively, while the two before the last are minimum temperature and maximum temperature. For each day I need to find the maximum and minimum of the day, subtract, and save the value for that day.
Two problems I ran into: how do I parse 24 lines at a time (there are no missing data lines!), and how do I find the maximum and minimum in each batch?
I have 6312 lines = 24 hr * 263 days.
So, to iterate through the lines:
import numpy as np
input_temps='/L7_HW_SASP_w1112.csv'
up_air_min=np.genfromtxt(input_temps,skip_header=1, dtype=float, delimiter=',',usecols=(5))
up_air_max=np.genfromtxt(input_temps,skip_header=1, dtype=float, delimiter=',',usecols=(6))
day_year=np.genfromtxt(input_temps,skip_header=1, dtype=float, delimiter=',',usecols=(0))
dt_per_all_days=[]
for i in range(0, 6312, 1):
    # I get stuck here: how do I limit the iteration to 24 lines at a time?
    # If I can do that I think I can get the rest done.
    min_d = []
    max_d = []
    min_d.append(up_air_min[i])
    max_d.append(up_air_max[i])
    max_per_day = max(max_d)
    min_per_day = min(min_d)
    dt_d = max_per_day - min_per_day
    dt_per_all_days.append(dt_d)
    del(min_d)
    del(max_d)
    # move to the next batch of 24 lines....
Use the Numpy, Luke, avoid for-loops.
Then, once you have the up_air_min and up_air_max numpy arrays, you can easily do what you want by using numpy element-wise functions.
First, create a 2d array with 263 rows (one per day) and 24 columns, like this:
min_matrix = up_air_min.reshape((263, 24))
max_matrix = up_air_max.reshape((263, 24))
Then use np.min and np.max functions along axis 1 (good array tip sheet):
min_temperature = np.min(min_matrix, axis=1)
max_temperature = np.max(max_matrix, axis=1)
And find the difference:
dt = max_temperature - min_temperature
dt is an array with the needed values. Let's save it to foo.csv together with the day numbers (taking every 24th day_year entry so that both columns have 263 values):
np.savetxt('foo.csv', np.swapaxes([day_year[::24], dt], 0, 1), delimiter=',')
And final code looks like this:
import numpy as np
# This I got from your answer.
input_temps='/L7_HW_SASP_w1112.csv'
up_air_min=np.genfromtxt(input_temps,skip_header=1, dtype=float, delimiter=',',usecols=(5))
up_air_max=np.genfromtxt(input_temps,skip_header=1, dtype=float, delimiter=',',usecols=(6))
day_year=np.genfromtxt(input_temps,skip_header=1, dtype=float, delimiter=',',usecols=(0))
# Split arrays and create matrix with 263 lines-days and 24 values in every line.
min_matrix = up_air_min.reshape((263, 24))
max_matrix = up_air_max.reshape((263, 24))
# Find min temperature for every day. min_temperature is an array with 263 values.
min_temperature = np.min(min_matrix, axis=1)
# The same for max temperature.
max_temperature = np.max(max_matrix, axis=1)
# Subtract min temperature from max.
dt = max_temperature - min_temperature
# Save result in csv.
np.savetxt('foo.csv', np.swapaxes([day_year[::24], dt], 0, 1), delimiter=',')  # every 24th day_year entry: one per day
A reasonably pythonic way to do this would be to have a function that loops over the rows, gathering them up and spitting out the gathered rows using yield when the day changes. This gives you a generator that generates 263 lists, each holding 24 values, which is a bit easier to process (a sketch follows below).
If you've definitely not got any missing values, you could use a trivial doubly-nested loop without batching up the elements first. That's a bit more fragile, but it sounds like you might not be planning to re-use the code anyway.
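A minimal sketch of that generator idea, assuming the day_year, up_air_min, and up_air_max arrays already loaded by the question's genfromtxt calls:
def days(rows):
    # group consecutive rows by day, yielding one list of (tmin, tmax) per day
    batch = []
    current_day = None
    for day, tmin, tmax in rows:
        if batch and day != current_day:
            yield batch
            batch = []
        current_day = day
        batch.append((tmin, tmax))
    if batch:
        yield batch

# one (max - min) value per day
rows = zip(day_year, up_air_min, up_air_max)
dt_per_all_days = [max(tmax for _, tmax in day) - min(tmin for tmin, _ in day)
                   for day in days(rows)]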
Here's a somewhat contrived example of how you could chunk things by 24 lines at a time.
from StringIO import StringIO
from random import random as r
import numpy as np

s = StringIO()
for x in xrange(0, 10000):
    s.write('%f,%f,%f\n' % (r(), r()*10, r()*100))
s.seek(0)
data = np.genfromtxt(s, dtype=None, names=['pitch', 'yaw', 'thrust'], delimiter=',')
for x in range(0, len(data), 24):
    print('Acting on hours %d through %d' % (x, x+24))
    one_day = data[x:x+24]
    minimum_yaw = min(one_day['yaw'])
    max_yaw = max(one_day['yaw'])
    print 'min', minimum_yaw, 'max', max_yaw, 'one_day', one_day['yaw']
