ConstraintMismatchError - Python

I am running the following code to create a simple line graph:
import matplotlib.pyplot as plt
import iris
import iris.coord_categorisation as iriscc
import iris.plot as iplt
import iris.quickplot as qplt
import iris.analysis.cartography
import matplotlib.dates as mdates
def main():
    #bring in all the files we need and give them a name
    TestFile = '/exports/csce/datastore/geos/users/s0XXXX/Climate_Modelling/AFR_44_tas/Historical/1950-2005/tas_AFR-44_MOHC-HadGEM2-ES_historical_r1i1p1_CLMcom-CCLM4-8-17_v1_mon_194912-200512.nc'

    #Load exactly one cube from given file
    TestFile = iris.load_cube(TestFile)
    print TestFile

    #adjust longitude as data is out by 180 degrees
    #remove flat latitude and longitude and only use grid latitude and grid longitude,
    #which are in the 3rd and 4th column of the file
    lats = iris.coords.DimCoord(TestFile.coords()[3].points[:, 0],
                                standard_name='latitude', units='degrees')
    lons = TestFile.coords()[4].points[0]
    for i in range(len(lons)):
        if lons[i] > 100.:
            lons[i] = lons[i] - 360.
    lons = iris.coords.DimCoord(lons,
                                standard_name='longitude', units='degrees')
    TestFile.remove_coord('latitude')
    TestFile.remove_coord('longitude')
    TestFile.remove_coord('grid_latitude')
    TestFile.remove_coord('grid_longitude')
    TestFile.add_dim_coord(lats, 1)
    TestFile.add_dim_coord(lons, 2)

    #we are only interested in the latitude and longitude relevant to Malawi
    Malawi = iris.Constraint(longitude=lambda v: 32.5 <= v <= 36.,
                             latitude=lambda v: -17. <= v <= -9.)
    TestFile = TestFile.extract(Malawi)

    #data is in Kelvin, but we would like to show it in Celsius
    TestFile.convert_units('Celsius')

    #We are interested in plotting the graph with time along the x axis,
    #so we need a mean of all the coordinates, i.e. mean temperature across the whole country
    iriscc.add_year(TestFile, 'time')
    TestFile = TestFile.aggregated_by('year', iris.analysis.MEAN)
    TestFile.coord('latitude').guess_bounds()
    TestFile.coord('longitude').guess_bounds()
    TestFile_grid_areas = iris.analysis.cartography.area_weights(TestFile)
    TestFile_mean = TestFile.collapsed(['latitude', 'longitude'],
                                       iris.analysis.MEAN,
                                       weights=TestFile_grid_areas)

    #set major plot indicators for x-axis
    plt.gca().xaxis.set_major_locator(mdates.YearLocator(5))

    #assign the line colours
    qplt.plot(TestFile_mean, label='TestFile', lw=1.5, color='blue')

    #create a legend and set its location to under the graph
    plt.legend(loc="upper center", bbox_to_anchor=(0.5, -0.05), fancybox=True, shadow=True, ncol=5)

    #create a title
    plt.title('Mean Near Surface Temperature for Malawi', fontsize=11)

    #create the graph
    plt.grid()
    iplt.show()

if __name__ == '__main__':
    main()
This works well for the majority of the files, but two climate models come up with ConstraintMismatchErrors:
runfile('/exports/csce/datastore/geos/users/s0XXXX/Climate_Modelling/Python Code and Output Images/Line_Graph_Temp_Test.py', wdir='/exports/csce/datastore/geos/users/s0XXXX/Climate_Modelling/Python Code and Output Images')
Traceback (most recent call last):
File "<ipython-input-83-4f4457568a8f>", line 1, in <module> runfile('/exports/csce/datastore/geos/users/s0XXXX/Climate_Modelling/Python Code and Output Images/Line_Graph_Temp_Test.py', wdir='/exports/csce/datastore/geos/users/s0XXXX/Climate_Modelling/Python Code and Output Images')
File "/usr/lib/python2.7/site-packages/spyderlib/widgets/externalshell/sitecustomize.py", line 685, in runfile
execfile(filename, namespace)
File "/usr/lib/python2.7/site-packages/spyderlib/widgets/externalshell/sitecustomize.py", line 78, in execfile
builtins.execfile(filename, *where)
File "/exports/csce/datastore/geos/users/s0XXXX/Climate_Modelling/Python Code and Output Images/Line_Graph_Temp_Test.py", line 84, in <module>
main()
File "/exports/csce/datastore/geos/users/s0XXXX/Climate_Modelling/Python Code and Output Images/Line_Graph_Temp_Test.py", line 21, in main
TestFile = iris.load_cube(TestFile)
File "/usr/lib64/python2.7/site-packages/iris/__init__.py", line 338, in load_cube
raise iris.exceptions.ConstraintMismatchError(str(e))
ConstraintMismatchError: failed to merge into a single cube.
cube.standard_name differs: None != u'air_temperature'
cube.long_name differs: None != u'Near-Surface Air Temperature'
cube.var_name differs: u'rotated_pole' != u'tas'
cube.units differs: Unit('no_unit') != Unit('K')
cube.attributes keys differ: 'grid_north_pole_latitude', 'grid_north_pole_longitude', 'grid_mapping_name'
cube.cell_methods differ
cube.shape differs: () != (660, 201, 194)
cube data dtype differs: |S1 != float32
cube data fill_value differs: '\x00' != 1e+20
Similarly, I get this error when trying to run the observed data (cru_ts4.00.1901.2015.tmp.dat.nc)
ConstraintMismatchError: failed to merge into a single cube.
cube.long_name differs: u'near-surface temperature' != None
cube.var_name differs: u'tmp' != u'stn'
cube.units differs: Unit('degrees Celsius') != Unit('1')
cube.attributes keys differ: 'correlation_decay_distance', 'description'
cube data dtype differs: float32 != int32
cube data fill_value differs: 9.96921e+36 != -2147483647
Any ideas on how I can fix this?

I received a response from Andrew Dawson on the Iris User Google Group. I'm posting it here in case it is of any help to someone else. This helped me!
The function iris.load_cube loads exactly one cube from the given file matching the given constraints. You haven't provided any constraints, which means you are expecting the file(s) you're loading from to reduce to exactly one cube. The ConstraintMismatchError from iris.load_cube is telling you that this is not possible due to some mismatched data. From the error it looks like you have more than one variable in your input file(s) for those models. You should consider adding an explicit constraint when loading, perhaps like:
iris.load_cube(filename, 'name_of_variable_here')
where name_of_variable_here should be the name the cube would be loaded with, i.e. the result of cube.name(). This is different from the netCDF variable name. To work out which name to use, I suggest loading all the cubes from one of the problematic datasets with
cubes = iris.load(the_filename) # load all the cubes in the input file
and then printing the names of the cubes
for cube in cubes:
    print(cube.name())
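Based on the merge error above, the temperature cube in those model files appears under the name air_temperature, and in the CRU observations file under its long name near-surface temperature. As a sketch (the names are taken from the error output and are worth confirming with the print loop above), the constrained loads would look like this:
# hedged example: cube names taken from the error output above; confirm
# them with the print loop before relying on them
TestFile = iris.load_cube(TestFile, 'air_temperature')
ObsFile = iris.load_cube('cru_ts4.00.1901.2015.tmp.dat.nc', 'near-surface temperature')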

Related

Can't get correct input for DBSCAN clustering

I have a node2vec embedding stored as a .csv file; the values form a square symmetric matrix. I have two versions of this, one with node names in the first column and another with node names in the first row. I would like to cluster this data with DBSCAN, but I can't seem to figure out how to get the input right. I tried this:
import numpy as np
import pandas as pd
from sklearn.cluster import DBSCAN
from sklearn import metrics
input_file = "node2vec-labels-on-columns.emb"
# for tab delimited use:
df = pd.read_csv(input_file, header = 0, delimiter = "\t")
# put the original column names in a python list
original_headers = list(df.columns.values)
emb = df.as_matrix()
db = DBSCAN(eps=0.3, min_samples=10).fit(emb)
labels = db.labels_
# Number of clusters in labels, ignoring noise if present.
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
n_noise_ = list(labels).count(-1)
print("Estimated number of clusters: %d" % n_clusters_)
print("Estimated number of noise points: %d" % n_noise_)
This leads to an error:
dbscan.py:14: FutureWarning: Method .as_matrix will be removed in a future version. Use .values instead.
emb = df.as_matrix()
Traceback (most recent call last):
File "dbscan.py", line 15, in <module>
db = DBSCAN(eps=0.3, min_samples=10).fit(emb)
File "C:\Python36\lib\site-packages\sklearn\cluster\_dbscan.py", line 312, in fit
X = self._validate_data(X, accept_sparse='csr')
File "C:\Python36\lib\site-packages\sklearn\base.py", line 420, in _validate_data
X = check_array(X, **check_params)
File "C:\Python36\lib\site-packages\sklearn\utils\validation.py", line 73, in inner_f
return f(**kwargs)
File "C:\Python36\lib\site-packages\sklearn\utils\validation.py", line 646, in check_array
allow_nan=force_all_finite == 'allow-nan')
File "C:\Python36\lib\site-packages\sklearn\utils\validation.py", line 100, in _assert_all_finite
msg_dtype if msg_dtype is not None else X.dtype)
ValueError: Input contains NaN, infinity or a value too large for dtype('float64').
I've tried other input methods that lead to the same error. All the tutorials I can find use datasets imported from sklearn, so those are of no help in figuring out how to read from a file. Can anyone point me in the right direction?
The error does not come from the fact that you are reading the dataset from a file, but from the content of the dataset.
DBSCAN is meant to be used on numerical data. As stated in the error, it does not support NaNs.
If you are willing to cluster strings or labels, you should find some other model.
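As a minimal sketch (assuming the same file as above and that every embedding column is meant to be numeric), you can coerce the data to numbers and drop non-finite rows before fitting:
import numpy as np
import pandas as pd
from sklearn.cluster import DBSCAN

df = pd.read_csv("node2vec-labels-on-columns.emb", header=0, delimiter="\t")

# coerce every cell to numeric; node names and other strings become NaN
emb = df.apply(pd.to_numeric, errors="coerce").values

# keep only rows with finite values before clustering
finite_rows = np.isfinite(emb).all(axis=1)
print("Dropping %d non-finite rows" % (~finite_rows).sum())

db = DBSCAN(eps=0.3, min_samples=10).fit(emb[finite_rows])
print(db.labels_)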

Import Data in Python with Pandas just for specific rows

I am really new to Python and I hope this is the right community for my question. Sorry if it is not.
I am trying to import data from a .txt file with pandas.
The file looks like this:
# Raman Scattering Spectrum
# X-Axis: Frequency (cm-1)
# Y-Axis: Intensity (10-36 m2 cm/sr)
# Harmonic Data
# Peak information (Harmonic)
# X Y
# 20.1304976000 1.1465331676
# 25.5433266000 6.0306906544
...
# 3211.8081700000 0.3440113123
# 3224.5118500000 0.8814596030
# Plot Curve (Harmonic)
# X Y DY/DX
0.0000000000 8.4803414671 0.6546818124
8.0000000000 17.8239097502 2.0146387573
I already wrote this piece of code to import my data:
import pandas as pd
# import matplotlib as plt
# import scipy as sp
data = pd.read_csv('/home/andrea/Schreibtisch/raman_gauss.txt', sep='\t')
data
Now I just get one column.
If I try it with
pd.read_fwf(file)
I get 3 columns, but the X and Y values from Plot Curve (Harmonic) are in one column.
Now I want to import the X, Y and DY/DX values from Plot Curve (Harmonic) into separate variables or containers as series.
The hard part for me is how to split X and Y into 2 columns, and how to tell Python that the import should start 2 lines after the line containing Plot Curve (Harmonic).
I have thought about it, and my idea was to check all containers for the string 'Plot Curve (Harmonic)'. Then I get a new series of True or False values, read out which line number is True for the search word, and start the import from that line...
I am too much of a newbie to Python and not yet familiar enough with the documentation to find the command I need.
Does anyone have tips for me on a command or something? And how to split the columns?
Thank you very much!
You can read it as follows.
Code
import pandas as pd
import re  # Regex to parse header

def get_data(filename):
    # Find row containing 'Plot Curve (Harmonic)'
    with open(filename, 'r') as f:
        for i, line in enumerate(f):
            if 'Plot Curve (Harmonic)' in line:
                start_row = i
                # Parse header on next line
                # [1:] to skip '#' at beginning of line
                header = re.findall(r'\S+', next(f))[1:]
                break
        else:
            start_row = None  # not found

    if start_row is not None:
        # delimiter=r"\s+": there are multiple spaces between numbers
        # skiprows=start_row+2: skip down to the data
        #   (skips the marker row and the header row)
        # reference: https://thispointer.com/pandas-skip-rows-while-reading-csv-file-to-a-dataframe-using-read_csv-in-python/
        # names=header: assigns column names
        df = pd.read_csv(filename, delimiter=r"\s+", skiprows=start_row + 2,
                         names=header)
        return df
Test
df = get_data('data.txt')
print(df)
data.txt file
# Raman Scattering Spectrum
# X-Axis: Frequency (cm-1)
# Y-Axis: Intensity (10-36 m2 cm/sr)
# Harmonic Data
# Peak information (Harmonic)
# X Y
# 20.1304976000 1.1465331676
# 25.5433266000 6.0306906544
...
# 3211.8081700000 0.3440113123
# 3224.5118500000 0.8814596030
# Plot Curve (Harmonic)
# X Y DY/DX
0.0000000000 8.4803414671 0.6546818124
8.0000000000 17.8239097502 2.0146387573
Output
X Y DY/DX
0 0.0 8.480341 0.654682
1 8.0 17.823910 2.014639
First: Thank you very much for your answer. It helps me a lot.
I tried to use the comment function but I cannot add an 'Enter'.
I want to plot the data I can now extract from the file, but when I add my standard plot code:
plt.plot(df.X, df.Y)
plt.legend(['simulated'])
plt.xlabel('raman_shift')
plt.ylabel('intensity')
plt.grid(True)
plt.show()
Now I get the error:
TypeError Traceback (most recent call last)
<ipython-input-240-8594f8545868> in <module>
28 plt.plot(df.X, df.Y)
29 plt.legend(['simulated'])
---> 30 plt.xlabel('raman_shift')
31 plt.ylabel('intensity')
32 plt.grid(True)
TypeError: 'str' object is not callable
I have not changed anything in the label function. In my other project these lines work well.
And I also don't know how to read out the DY/DX column; the '/' cannot be used in the column name.
Do you have a tip for me again? :)
Thanks.
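As a hedged sketch (assuming the DataFrame returned by get_data above), a column name containing '/' can still be read with bracket indexing instead of attribute access:
# df.DY/DX would be parsed as (df.DY / DX), so use bracket indexing instead
slope = df['DY/DX']
plt.plot(df['X'], df['Y'])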

Length-1 Arrays and Python Scalars Via plt.text

I'm trying to use plt.text to plot temperature values at their associated lat/lon points on a plot.
After reviewing the plt.text documentation, it appears that the plotted value (third arg) has to be a number and that the number has to be a whole number, NOT a number with decimals.
Below is the code that I'm trying to work with and the associated traceback error that I'm receiving:
Script Code:
data = np.loadtxt('/.../.../.../tmax_day0', delimiter=',', skiprows=1)
grid_x, grid_y = np.mgrid[-85:64:dx, 34:49:dx]
temp = data[:,2]
#print temp
grid_z = griddata((data[:,1],data[:,0]), data[:,2], (grid_x,grid_y), method='linear')
x,y = m(data[:,1], data[:,0]) # flip lat/lon
grid_x,grid_y = m(grid_x,grid_y)
#m.plot(x,y, 'ko', markersize=2)
def str_to_float(str):
    try:
        number = float(str)
    except ValueError:
        number = 0.0
    return number

fmt = str_to_float(temp)

#annotate point temperature on plot
plt.text(grid_x, grid_y, fmt, fontdict=None)
Traceback Error:
Traceback (most recent call last):
File "plotpoints.py", line 56, in <module>
fmt = str_to_float(temp)
File "plotpoints.py", line 51, in str_to_float
number = float(str)
TypeError: only length-1 arrays can be converted to Python scalars
Data sample from text file tmax_day0:
latitude,longitude,value
36.65408,-83.21783,90
41.00928,-74.73628,92.02
43.77714,-71.75598,90
44.41944,-72.01944,88.8
39.5803,-79.3394,79
38.3154,-76.5501,86
38.91444,-82.09833,94
40.64985,-75.44771,92.6
41.25389,-70.05972,81.2
39.45202,-74.56699,90.88
I was able to achieve plotting data values only by using the following code:
for i in range(len(temp)):
    plt.text(x[i], y[i], temp[i], va="top", family="monospace")
Result:
You aren't using a "proper" array, and are instead using a numpy array. Numpy arrays don't play well with non-numpy functions.
Going from your comment, this has been edited.
You would first need to fix the string so it's a proper array.
fmt = fmt[0].split()
I think this should work to create a new (normal) array of strings. And then this maps that to an array of floats:
list_of_floats = np.array(map(float, fmt))
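A minimal sketch of the per-point approach (assuming x, y and temp from the question's np.loadtxt call), formatting each temperature as a string as it is annotated instead of converting the whole array at once:
# hedged sketch: annotate each station with its temperature, one decimal place
for xi, yi, ti in zip(x, y, temp):
    plt.text(xi, yi, '%.1f' % ti, va='top', family='monospace')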

Python: create multiple boxplots in one pannel

I have been using R for a long time and have recently been learning Python.
I would like to create multiple box plots in one panel in Python.
My dataset is in vector form, and a label vector indicates which box plot each element of the data corresponds to. The example looks like this:
N = 50
data = np.random.lognormal(size=N, mean=1.5, sigma=1.75)
label = np.repeat([1,2,3,4,5],N/5)
From various websites (e.g., matplotlib: Group boxplots), creating multiple boxplots requires a matrix-like input whose columns each contain the samples for one boxplot. So I created a list object based on data and label:
savelist = data[label == 1]
for i in [2,3,4,5]:
    savelist = [savelist, data[label == i]]
However, the code below gives me an error:
boxplot(savelist)
Traceback (most recent call last):
File "<ipython-input-222-1a55d04981c4>", line 1, in <module>
boxplot(savelist)
File "/Users/yumik091186/anaconda/lib/python2.7/site-packages/matplotlib/pyplot.py", line 2636, in boxplot
meanprops=meanprops, manage_xticks=manage_xticks)
File "/Users/yumik091186/anaconda/lib/python2.7/site-packages/matplotlib/axes/_axes.py", line 3045, in boxplot labels=labels)
File "/Users/yumik091186/anaconda/lib/python2.7/site-packages/matplotlib/cbook.py", line 1962, in boxplot_stats
stats['mean'] = np.mean(x)
File "/Users/yumik091186/anaconda/lib/python2.7/site-packages/numpy/core/fromnumeric.py", line 2727, in mean
out=out, keepdims=keepdims)
File "/Users/yumik091186/anaconda/lib/python2.7/site-packages/numpy/core/_methods.py", line 66, in _mean
ret = umr_sum(arr, axis, dtype, out, keepdims)
ValueError: operands could not be broadcast together with shapes (2,) (10,)
Can anyone explain what is going on?
You're ending up with a nested list instead of a flat list. Try this instead:
savelist = [data[label == 1]]
for i in [2,3,4,5]:
    savelist.append(data[label == i])
And it should work.
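A minimal self-contained sketch, using the data and label vectors from the question, that draws all five box plots in one panel:
import numpy as np
import matplotlib.pyplot as plt

N = 50
data = np.random.lognormal(size=N, mean=1.5, sigma=1.75)
label = np.repeat([1, 2, 3, 4, 5], N // 5)

# one list entry per box plot
savelist = [data[label == i] for i in [1, 2, 3, 4, 5]]

plt.boxplot(savelist, labels=['1', '2', '3', '4', '5'])
plt.show()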

Python zero-size array to ufunc.reduce without identity

I'm trying to make a histogram of some data that is being stored in an ndarray. The histogram is part of a set of analyses which I've made into a class in a Python program. The part of the code that isn't working is below.
def histogram(self, iters):
    samples = T.MCMC(iters)  #Returns an [iters,3,4] ndarray
    histAC = plt.figure(self.ip)  #plt is matplotlib's pyplot
    self.ip += 1  #defined at the beginning of the class to start at 0
    for l in range(0,4):
        h = histAC.add_subplot(2,(iters+1)/2,l+1)
        for i in range(0,0.5*self.chan_num):
            intAvg = mean(samples[:,i,l])
            print intAvg
            for k in range(0,iters):
                samples[k,i,l] = samples[k,i,l]-intAvg
        print "Samples is ",samples
        h.hist(samples,bins=5000,range=[-6e-9,6e-9],histtype='step')
        h.legend(loc='upper right')
        h.set_title("AC Pulse Integral Histograms: "+str(l))
    figname = 'ACHistograms.png'
    figpath = 'plot'+str(self.ip)
    print "Finished!"
    #plt.savefig(figpath + figname, format = 'png')
This gives me the following error message:
File "johnmcmc.py", line 257, in histogram
h.hist(samples,bins=5000,range=[-6e-9,6e-9],histtype='step') #removed label=apdlabel
File "/x/tsfit/local/lib/python2.6/site-packages/matplotlib/axes.py", line 7238, in hist
ymin = np.amin(m[m!=0]) # filter out the 0 height bins
File "/x/tsfit/local/lib/python2.6/site-packages/numpy/core/fromnumeric.py", line 1829, in amin
return amin(axis, out)
ValueError: zero-size array to ufunc.reduce without identity
The only search results I've found have been multiple copies of the same two conversations. The only thing I learned from them was that Python histograms don't like being fed empty arrays, which is why I added the print statement right above the line that's giving me trouble, to make sure the array isn't empty.
Has anyone else come across this error before?
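From the line quoted in the traceback, m[m!=0] is empty when every bin count is zero, which happens if all of samples falls outside the fixed range [-6e-9, 6e-9]. A hedged check (assuming samples as printed above) before the hist call would be:
# hedged check: count how many mean-subtracted samples actually fall in the
# histogram range; if this prints 0, every bin is empty and amin() fails
in_range = ((samples > -6e-9) & (samples < 6e-9)).sum()
print "Samples inside the histogram range:", in_range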
