I am trying to understand the difference in performance between single and double precision on our GPU workstation.
Our workstation is equipped with two TITAN RTX GPUs, but I am running the benchmark on a single TITAN RTX.
I am testing the performance with cuBLAS matrix-matrix multiplications: I multiply two 8192x8192 matrices filled with random floats or doubles. To make sure there is no mistake on my end, I also repeated the procedure in Python using the cupy library, and the results are very similar.
The test results are ~75 ms per multiplication for floats and ~2,000 ms for doubles.
If I had an older GPU, this would make a lot of sense, since 75*32 = 2,400 ≈ 2,000, i.e. double-precision performance would be ~32 times worse, as expected from the table at https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#arithmetic-instructions.
However, my GPU has compute capability 7.5, so I expected performance with doubles to degrade only by a factor of 2.
Other info: Ubuntu 18.04 LTS, nvcc 10.2, driver 440.82.
Here is the CUDA code:
#include <iostream>
#include <chrono>
#include <string>
#include <cuda_runtime.h>
#include "cublas_v2.h"
#include <math.h>
#include <stdio.h>
#include <cuda.h>
#include <device_functions.h>
#include <sstream>
#include <time.h>
unsigned long mix(unsigned long a, unsigned long b, unsigned long c)
{
    a=a-b;  a=a-c;  a=a^(c >> 13);
    b=b-c;  b=b-a;  b=b^(a << 8);
    c=c-a;  c=c-b;  c=c^(b >> 13);
    a=a-b;  a=a-c;  a=a^(c >> 12);
    b=b-c;  b=b-a;  b=b^(a << 16);
    c=c-a;  c=c-b;  c=c^(b >> 5);
    a=a-b;  a=a-c;  a=a^(c >> 3);
    b=b-c;  b=b-a;  b=b^(a << 10);
    c=c-a;  c=c-b;  c=c^(b >> 15);
    return c;
}
using namespace std;

int main()
{
    int deviceCount;
    cudaGetDeviceCount(&deviceCount);
    cudaDeviceProp deviceProp;
    cudaGetDeviceProperties(&deviceProp, 0);
    printf("Detected %d devices \n", deviceCount);
    printf("Device %d has compute capability %d.%d:\n\t maxshmem %zu. \n\t maxthreads per block %d. \n\t max threads dim %d. %d. %d.\n ",
           0, deviceProp.major, deviceProp.minor, deviceProp.sharedMemPerBlock, deviceProp.maxThreadsPerBlock,
           deviceProp.maxThreadsDim[0], deviceProp.maxThreadsDim[1], deviceProp.maxThreadsDim[2]);

    cudaEvent_t start_d, stop_d;
    cudaEventCreate(&start_d);
    cudaEventCreate(&stop_d);

    // RNG initialization
    unsigned long seed = mix(clock(), time(NULL), 0);
    srand(seed);

    int N = 8192;
    int Nloops = 2;
    int memsize = N * N * sizeof(double);

    // Host matrices filled with uniform random numbers
    double *a = (double *)malloc(memsize);
    double *b = (double *)malloc(memsize);
    double *c = (double *)malloc(memsize);
    for (int i = 0; i < N; i++)
        for (int j = 0; j < N; j++){
            a[i*N+j] = ((double)rand() / RAND_MAX);
            b[i*N+j] = ((double)rand() / RAND_MAX);
        }

    // Device buffers
    double *a_d, *b_d, *c_d;
    cudaMalloc((void **)&a_d, memsize);
    cudaMalloc((void **)&b_d, memsize);
    cudaMalloc((void **)&c_d, memsize);
    cudaMemcpy(a_d, a, memsize, cudaMemcpyHostToDevice);
    cudaMemcpy(b_d, b, memsize, cudaMemcpyHostToDevice);

    cublasHandle_t handle;
    cublasCreate(&handle);
    double alpha = 1.0;
    double beta = 0.0;

    // Time Nloops multiplications three ways: CUDA events, clock(), and steady_clock
    auto start = chrono::steady_clock::now();
    clock_t start1 = clock();
    cudaEventRecord(start_d);
    cudaError_t cudaErr = cudaGetLastError();
    if (cudaErr != cudaSuccess)
        printf("%s \n", cudaGetErrorString(cudaErr));
    for (int i = 0; i < Nloops; i++)
        cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N, &alpha, a_d, N, b_d, N, &beta, c_d, N);
    cudaEventRecord(stop_d);
    cudaDeviceSynchronize();
    auto end = chrono::steady_clock::now();
    start1 = clock() - start1;
    cudaEventSynchronize(stop_d);
    cublasDestroy(handle);

    float milliseconds = 0;
    cudaEventElapsedTime(&milliseconds, start_d, stop_d);
    std::cout << "Cuda event " << milliseconds / Nloops << " ms" << endl;
    std::cout << " time elapsed " << start1 / (double)CLOCKS_PER_SEC / Nloops << '\n';
    cout << "time elapsed for 1 multiplication: "
         << ((double)chrono::duration_cast<chrono::microseconds>(end - start).count()) / (Nloops * 1000.0)
         << " milliseconds" << endl;

    free(a); free(b); free(c);
    cudaFree(a_d); cudaFree(b_d); cudaFree(c_d);
}
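The float benchmark is not shown, but presumably it is the same program with double replaced by float and cublasDgemm by cublasSgemm, along these lines (the buffer names af_d, bf_d, cf_d are placeholders for float device arrays allocated like the ones above):

// Hypothetical single-precision variant of the timed loop
float alphaf = 1.0f, betaf = 0.0f;
for (int i = 0; i < Nloops; i++)
    cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N, &alphaf, af_d, N, bf_d, N, &betaf, cf_d, N);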
And this is the Python code, which yields consistent results:
import cupy as cp
import time

iterations = 2
a = cp.random.rand(8192,8192).astype(cp.float64)
b = cp.random.rand(8192,8192).astype(cp.float64)

def ab(a,b,iterations):
    for i in range(iterations):
        cp.matmul(a,b,out=None)

ab(a,b,1)  # warm up
cp.cuda.Device(0).synchronize()

t1 = time.time()
ab(a,b,iterations)
cp.cuda.Device(0).synchronize()
t2 = time.time()

total = (t2-t1)/iterations
print(total)
OK, I found the answer. The table I link in my question has a footnote saying that for compute capability 7.5 (which is my case) the double-precision throughput is 2 results per clock cycle per SM, not 32, while for floats it is 64. That means multiply-add operations on doubles are 32 times slower than on floats.
If both the float and double problems were fully arithmetic-bound, I would expect a slowdown of ~32. In reality the slowdown is slightly smaller (2000/75 ≈ 27), which may be because the float problem is partly bandwidth-bound, or it may be related to other things.
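As a sanity check, a back-of-envelope estimate using the published TITAN RTX figures (72 SMs, boost clock around 1.77 GHz; actual boost clocks vary) is consistent with these timings:

FP32 peak: 72 SMs x 64 FMA/clock x 2 flops/FMA x ~1.77 GHz ≈ 16.3 TFLOP/s
FP64 peak: FP32 peak / 32 ≈ 0.5 TFLOP/s
Work per multiplication: 2 x 8192^3 ≈ 1.1 TFLOP
Expected times: 1.1 / 16.3 ≈ 67 ms for floats, and roughly 32 x that, ≈ 2.1 s, for doubles

which is close to the measured ~75 ms and ~2,000 ms.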
Related
I've created some data in numpy that I would like to use in a separate C++ program. Therefore I need to save the data using Python and later load it in C++. What is the best way of doing this?
My numpy ndarray is float32 and of shape [10000 x 18 x 5]. I can save it, for example, using
numpy.save(filename, data)
Is there an easy way to load such data in C++? Target structure could be an Eigen::Matrix for example.
After searching for hours, I found my year-old example files.
Caveats:
the solution only covers 2D matrices
it is not suited for 3-dimensional or generic ndarrays
Write the numpy array to an ASCII file with a header specifying nrows and ncols:
def write_matrix2D_to_ascii(filename, matrix2D):
    nrows, ncols = matrix2D.shape
    with open(filename, "w") as file:
        # write header [rows x cols]
        file.write(f"{nrows} {ncols}")
        file.write("\n")
        # write values
        for row in range(nrows):
            for col in range(ncols):
                value = matrix2D[row, col]
                file.write(str(value))
                file.write(" ")
            file.write("\n")
Example output data-file.txt looks like this (first row is header specifying nrows and ncols):
2 3
1.0 2.0 3.0
4.0 5.0 6.0
C++ function to read the matrix from an ASCII file into an OpenCV matrix:
#include <iostream>
#include <fstream>
#include <iomanip> // set precision of output string
#include <opencv2/core/core.hpp> // OpenCV matrices for storing data
using namespace std;
using namespace cv;
void readMatAsciiWithHeader( const string& filename, Mat& matData)
{
    cout << "Create matrix from file :" << filename << endl;

    ifstream inFileStream(filename.c_str());
    if(!inFileStream){
        cout << "File cannot be found" << endl;
        exit(-1);
    }

    int rows, cols;
    inFileStream >> rows;
    inFileStream >> cols;
    matData.create(rows, cols, CV_32F);
    cout << "numRows: " << rows << "\t numCols: " << cols << endl;

    matData.setTo(0); // init all values to 0
    float *dptr;
    for(int ridx=0; ridx < matData.rows; ++ridx){
        dptr = matData.ptr<float>(ridx);
        for(int cidx=0; cidx < matData.cols; ++cidx, ++dptr){
            inFileStream >> *dptr;
        }
    }
    inFileStream.close();
}
Driver code to use the above function in a C++ program:
Mat myMatrix;
readMatAsciiWithHeader("path/to/data-file.txt", myMatrix);
For completeness, some code to save the data using C++:
int saveMatAsciiWithHeader( const string& filename, Mat& matData)
{
    if (matData.empty()){
        cout << "File could not be saved. MatData is empty" << endl;
        return 0;
    }

    ofstream oStream(filename.c_str());

    // Create header
    oStream << matData.rows << " " << matData.cols << endl;

    // Write data
    for(int ridx=0; ridx < matData.rows; ridx++)
    {
        for(int cidx=0; cidx < matData.cols; cidx++)
        {
            oStream << setprecision(9) << matData.at<float>(ridx,cidx) << " ";
        }
        oStream << endl;
    }
    oStream.close();
    cout << "Saved " << filename.c_str() << endl;
    return 1;
}
Future work:
solution for 3D matrices
conversion to Eigen::Matrix (a starting sketch below)
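For the Eigen::Matrix item, one possible starting point (a sketch, not a tested solution) is OpenCV's Eigen interop header, which requires OpenCV to be built with Eigen support; the helper function below is hypothetical:

#include <opencv2/core/eigen.hpp> // cv::cv2eigen
#include <Eigen/Dense>

// Load the ASCII file with the reader above, then copy into an Eigen matrix
Eigen::MatrixXf readMatrixAsEigen(const string& filename)
{
    Mat matData;
    readMatAsciiWithHeader(filename, matData);
    Eigen::MatrixXf eigenMat;
    cv::cv2eigen(matData, eigenMat); // element-wise copy; handles row- vs column-major storage
    return eigenMat;
}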
I would like to duplicate in C++ the testing of some code that has already been implemented in Python3 and relies on numpy.random.rand and randn values with a specific seed (e.g., seed = 1).
I understand that Python's random implementation is based on a Mersenne twister. The C++ standard library also supplies this in std::mersenne_twister_engine.
The C++ version returns an unsigned int, whereas Python's rand is a floating-point value.
Is there a way to obtain the same values in C++ as are generated in Python, and to be sure that they are the same? And the same for an array generated by randn?
You can do it this way for integer values:
import numpy as np
np.random.seed(12345)
print(np.random.randint(256**4, dtype='<u4', size=1)[0])
#include <iostream>
#include <random>

int main()
{
    std::mt19937 e2(12345);
    std::cout << e2() << std::endl;
}
The result of both snippets is 3992670690
By looking at the source code of rand, you can implement it in your C++ code this way:
import numpy as np
np.random.seed(12345)
print(np.random.rand())
#include <iostream>
#include <iomanip>
#include <random>

int main()
{
    std::mt19937 e2(12345);

    // numpy.random.rand draws two 32-bit values per double:
    // the high 27 bits and the low 26 bits of a 53-bit value
    int a = e2() >> 5;
    int b = e2() >> 6;
    double value = (a * 67108864.0 + b) / 9007199254740992.0;

    std::cout << std::fixed << std::setprecision(16) << value << std::endl;
}
Both random values are 0.9296160928171479
It would be convenient to use std::generate_canonical, but it uses a different method to convert the output of the Mersenne twister to a double. The reason they differ is likely that generate_canonical is more optimized than the conversion used in NumPy, avoiding costly floating-point operations, especially multiplication and division, as seen in the source code. However, its result seems to be implementation-dependent, while NumPy produces the same result on all platforms.
double value = std::generate_canonical<double, std::numeric_limits<double>::digits>(e2);
This doesn't work: it produces 0.8901547132827379, which differs from the output of the Python code.
For completeness, and to avoid reinventing the wheel, here is an implementation of both numpy.rand and numpy.randn in C++.
The header file:
#ifndef RANDOMNUMGEN_NUMPYCOMPATIBLE_H
#define RANDOMNUMGEN_NUMPYCOMPATIBLE_H

#include "RandomNumGenerator.h"

//Uniform distribution - numpy.rand
class RandomNumGen_NumpyCompatible {
public:
    RandomNumGen_NumpyCompatible();
    RandomNumGen_NumpyCompatible(std::uint_fast32_t newSeed);

    std::uint_fast32_t min() const { return m_mersenneEngine.min(); }
    std::uint_fast32_t max() const { return m_mersenneEngine.max(); }

    void seed(std::uint_fast32_t seed);
    void discard(unsigned long long); // NOTE!! Advances and discards twice as many values as passed in, to keep tracking with the NumPy order
    uint_fast32_t operator()();       //Simply returns the next Mersenne value from the engine
    double getDouble();               //Calculates the next uniformly random double as numpy.rand does

    std::string getGeneratorType() const { return "RandomNumGen_NumpyCompatible"; }

private:
    std::mt19937 m_mersenneEngine;
};

///////////////////

//Gaussian distribution - numpy.randn
class GaussianRandomNumGen_NumpyCompatible {
public:
    GaussianRandomNumGen_NumpyCompatible();
    GaussianRandomNumGen_NumpyCompatible(std::uint_fast32_t newSeed);

    std::uint_fast32_t min() const { return m_mersenneEngine.min(); }
    std::uint_fast32_t max() const { return m_mersenneEngine.max(); }

    void seed(std::uint_fast32_t seed);
    void discard(unsigned long long); // NOTE!! Advances and discards twice as many values as passed in, to keep tracking with the NumPy order
    uint_fast32_t operator()();       //Simply returns the next Mersenne value from the engine
    double getDouble();               //Calculates the next normally (Gaussian) distributed random double as numpy.randn does

    std::string getGeneratorType() const { return "GaussianRandomNumGen_NumpyCompatible"; }

private:
    bool m_haveNextVal;
    double m_nextVal;
    std::mt19937 m_mersenneEngine;
};

#endif
And the implementation:
#include "RandomNumGen_NumpyCompatible.h"
#include <cmath>

RandomNumGen_NumpyCompatible::RandomNumGen_NumpyCompatible()
{
}

RandomNumGen_NumpyCompatible::RandomNumGen_NumpyCompatible(std::uint_fast32_t seed)
    : m_mersenneEngine(seed)
{
}

void RandomNumGen_NumpyCompatible::seed(std::uint_fast32_t newSeed)
{
    m_mersenneEngine.seed(newSeed);
}

void RandomNumGen_NumpyCompatible::discard(unsigned long long z)
{
    //Advances and discards TWICE as many values, to keep with the NumPy order
    m_mersenneEngine.discard(2*z);
}

std::uint_fast32_t RandomNumGen_NumpyCompatible::operator()()
{
    return m_mersenneEngine();
}

double RandomNumGen_NumpyCompatible::getDouble()
{
    //Two 32-bit draws per double: the high 27 bits and the low 26 bits of a 53-bit value
    int a = m_mersenneEngine() >> 5;
    int b = m_mersenneEngine() >> 6;
    return (a * 67108864.0 + b) / 9007199254740992.0;
}

///////////////////

GaussianRandomNumGen_NumpyCompatible::GaussianRandomNumGen_NumpyCompatible()
    : m_haveNextVal(false)
{
}

GaussianRandomNumGen_NumpyCompatible::GaussianRandomNumGen_NumpyCompatible(std::uint_fast32_t seed)
    : m_haveNextVal(false), m_mersenneEngine(seed)
{
}

void GaussianRandomNumGen_NumpyCompatible::seed(std::uint_fast32_t newSeed)
{
    m_mersenneEngine.seed(newSeed);
}

void GaussianRandomNumGen_NumpyCompatible::discard(unsigned long long z)
{
    //Burn some CPU cycles here
    for (unsigned i = 0; i < z; ++i)
        getDouble();
}

std::uint_fast32_t GaussianRandomNumGen_NumpyCompatible::operator()()
{
    return m_mersenneEngine();
}

double GaussianRandomNumGen_NumpyCompatible::getDouble()
{
    if (m_haveNextVal) {
        //Return the second value of the pair generated on the previous call
        m_haveNextVal = false;
        return m_nextVal;
    }

    //Rejection-sample a point inside the unit circle
    double f, x1, x2, r2;
    do {
        int a1 = m_mersenneEngine() >> 5;
        int b1 = m_mersenneEngine() >> 6;
        int a2 = m_mersenneEngine() >> 5;
        int b2 = m_mersenneEngine() >> 6;
        x1 = 2.0 * ((a1 * 67108864.0 + b1) / 9007199254740992.0) - 1.0;
        x2 = 2.0 * ((a2 * 67108864.0 + b2) / 9007199254740992.0) - 1.0;
        r2 = x1 * x1 + x2 * x2;
    } while (r2 >= 1.0 || r2 == 0.0);

    /* Polar (Marsaglia) variant of the Box-Muller transform */
    f = sqrt(-2.0 * log(r2) / r2);
    m_haveNextVal = true;
    m_nextVal = f * x1;
    return f * x2;
}
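A quick way to check the uniform generator against the Python snippet above (a hypothetical driver; with seed 12345 the first value should be the 0.9296160928171479 shown earlier):

#include <iostream>
#include <iomanip>
#include "RandomNumGen_NumpyCompatible.h"

int main()
{
    RandomNumGen_NumpyCompatible gen(12345);
    // Expect 0.9296160928171479, matching np.random.rand() after np.random.seed(12345)
    std::cout << std::fixed << std::setprecision(16) << gen.getDouble() << std::endl;

    GaussianRandomNumGen_NumpyCompatible gaussGen(12345);
    std::cout << gaussGen.getDouble() << std::endl; // compare against np.random.randn() by hand
}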
After doing a bit of testing, it does seem that the values are within a tolerance (see fdermishin's comment below) when the C++ unsigned int is divided by the maximum value for an unsigned int, like this:
#include <limits>
...
std::mt19937 generator1(seed); // mt19937 is a standard mersenne_twister_engine
unsigned val1 = generator1();
std::cout << "Gen 1 random value: " << val1 << std::endl;
std::cout << "Normalized Gen 1: " << static_cast<double>(val1) / std::numeric_limits<std::uint32_t>::max() << std::endl;
However, Python's version seems to skip every other value.
Given the following two programs:
#!/usr/bin/env python3

import sys
import numpy as np

def main():
    np.random.seed(1)
    for i in range(0, 10):
        print(np.random.rand())

###########
# Call main and exit success
if __name__ == "__main__":
    main()
    sys.exit()
and
#include <cstdlib>
#include <iostream>
#include <random>
#include <limits>

int main()
{
    unsigned seed = 1;
    std::mt19937 generator1(seed); // mt19937 is a standard mersenne_twister_engine

    for (unsigned i = 0; i < 10; ++i) {
        unsigned val1 = generator1();
        std::cout << "Normalized, #" << i << ": "
                  << (static_cast<double>(val1) / std::numeric_limits<std::uint32_t>::max()) << std::endl;
    }

    return EXIT_SUCCESS;
}
the Python program prints:
0.417022004702574
0.7203244934421581
0.00011437481734488664
0.30233257263183977
0.14675589081711304
0.0923385947687978
0.1862602113776709
0.34556072704304774
0.39676747423066994
0.538816734003357
whereas the C++ program prints:
Normalized, #0: 0.417022
Normalized, #1: 0.997185
Normalized, #2: 0.720324
Normalized, #3: 0.932557
Normalized, #4: 0.000114381
Normalized, #5: 0.128124
Normalized, #6: 0.302333
Normalized, #7: 0.999041
Normalized, #8: 0.146756
Normalized, #9: 0.236089
I could easily skip every other value in the C++ version, which should give me numbers that match the Python version (within a tolerance). But why does Python's implementation seem to skip every other value, and where do the extra values in the C++ version come from?
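The extra values come from the fact that NumPy's rand consumes two 32-bit Mersenne outputs per double (the high 27 bits and the low 26 bits of a 53-bit value, as in the earlier answer), while the C++ loop normalizes every single draw. A minimal sketch that should reproduce the Python sequence for seed 1, using the same combining formula:

#include <iostream>
#include <iomanip>
#include <random>

int main()
{
    std::mt19937 gen(1);
    for (int i = 0; i < 10; ++i) {
        // Two draws per double, exactly as numpy.random.rand does
        double a = gen() >> 5; // high 27 bits
        double b = gen() >> 6; // low 26 bits
        std::cout << std::fixed << std::setprecision(16)
                  << (a * 67108864.0 + b) / 9007199254740992.0 << std::endl;
    }
}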
I had some code in Python3 (with numpy) that I wanted to convert to C++ (with eigen3) to get a more efficient program. To estimate the performance gain, I tested a simple example: two random arrays multiplied coefficient-wise. My conclusion was that the Python code with numpy is about 30% faster than the C++ version. I'd like to know why the interpreted Python code is faster than compiled C++ code. Am I missing something in the C++ code?
I'm using gcc 9.1.0, Eigen 3.3.7, Python 3.7.3 and Numpy 1.16.4.
Possible explanations:
C++ program isn't using vectorization
Numpy is a lot more optimized than I thought
Time is measuring different things in each program
There is a similar question on Stack Overflow (Eigen Matrix vs Numpy Array multiplication performance). I tested that case on my computer and got the expected result that Eigen is more efficient than numpy, but the operation there is matrix multiplication rather than coefficient-wise multiplication.
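One quick way to probe the vectorization hypothesis (a suggestion, not part of the original post) is to let the compiler target the host CPU and compare timings:

g++ -O3 -march=native -I/usr/include/eigen3/ main_eigen.cpp -o prog_eigen

With AVX/FMA enabled, Eigen can vectorize the coefficient-wise product more aggressively; if the timings barely move, the loop is probably memory-bound rather than compute-bound.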
Python code (main.py)
Execution command: python3 main.py
import numpy as np
import time
Lx = 4096
Ly = 4000
# Filling arrays
a = np.random.rand(Lx, Ly).astype(np.float64)
a1 = np.random.rand(Lx, Ly).astype(np.float64)
# Coefficient-wise product
start = time.time()
b = a*a1
# Compute the elapsed time
end = time.time()
print(b.sum())
print("duration: ", end-start)
C++ code with eigen3 (main_eigen.cpp)
Compilation command: g++ -O3 -I/usr/include/eigen3/ main_eigen.cpp -o prog_eigen
#include <iostream>
#include <chrono>
#include "Eigen/Dense"

#define Lx 4096
#define Ly 4000

typedef double T;

int main(){
    // Allocating arrays
    Eigen::Array<T, -1, -1> KPM_ghosts(Lx, Ly), KPM_ghosts1(Lx, Ly), b(Lx, Ly);

    // Filling the arrays
    KPM_ghosts.setRandom();
    KPM_ghosts1.setRandom();

    // Coefficient-wise product
    auto start = std::chrono::system_clock::now();
    b = KPM_ghosts*KPM_ghosts1;

    // Compute the elapsed time
    auto end = std::chrono::system_clock::now();
    std::chrono::duration<double> elapsed_seconds = end - start;
    std::cout << "elapsed time: " << elapsed_seconds.count() << "s\n";

    // Print the sum so the compiler doesn't optimize the code away
    std::cout << b.sum() << "\n";

    return 0;
}
Plain C++ code (main.cpp)
Compilation command: g++ -O3 main.cpp -o prog
#include <iostream>
#include <chrono>
#include <cstdlib>

#define Lx 4096
#define Ly 4000
#define N Lx*Ly

typedef double T;

int main(){
    // Allocating arrays
    // (note: three ~125 MB arrays on the stack; may need an enlarged stack limit, e.g. ulimit -s unlimited)
    T lin_vector1[N];
    T lin_vector2[N];
    T lin_vector3[N];

    // Filling the arrays
    for(unsigned i = 0; i < N; i++){
        lin_vector1[i] = std::rand()*1.0/RAND_MAX;
        lin_vector2[i] = std::rand()*1.0/RAND_MAX;
    }

    // Coefficient-wise product
    auto start = std::chrono::system_clock::now();
    for(unsigned i = 0; i < N; i++)
        lin_vector3[i] = lin_vector1[i]*lin_vector2[i];

    // Compute the elapsed time
    auto end = std::chrono::system_clock::now();
    std::chrono::duration<double> elapsed_seconds = end - start;
    std::cout << "elapsed time: " << elapsed_seconds.count() << "s\n";

    // Print the sum so the compiler doesn't optimize the code away
    double sum = 0;
    for(unsigned i = 0; i < N; i++)
        sum += lin_vector3[i];
    std::cout << "sum: " << sum << "\n";

    return 0;
}
Runtime of each program, run 10 times:
Plain C++
elapsed time: 0.210664s
elapsed time: 0.215406s
elapsed time: 0.222483s
elapsed time: 0.21526s
elapsed time: 0.216346s
elapsed time: 0.218951s
elapsed time: 0.21587s
elapsed time: 0.213639s
elapsed time: 0.219399s
elapsed time: 0.213403s
Plain C++ with eigen3
elapsed time: 0.21052s
elapsed time: 0.220779s
elapsed time: 0.216269s
elapsed time: 0.229234s
elapsed time: 0.212265s
elapsed time: 0.256714s
elapsed time: 0.212396s
elapsed time: 0.248241s
elapsed time: 0.241537s
elapsed time: 0.323519s
Python
duration: 0.23946428298950195
duration: 0.1663036346435547
duration: 0.17225909233093262
duration: 0.15922021865844727
duration: 0.16628384590148926
duration: 0.15654635429382324
duration: 0.15859222412109375
duration: 0.1633443832397461
duration: 0.1685199737548828
duration: 0.16393446922302246
I've coded my classifier with libsvm's Python utility, and it has worked quite well so far. Here is an example of how I call my Python API:
print svmutil.svm_predict([2], [f.flatten().tolist()], libsvm_model, '-b 1')
where f is a (1024,1) vector.
I saved the model and loaded it using the C++ API. However, when I attempt to load and predict the same vector, it gives me wrong results.
cv::Mat oneCol = fcMat.row(0);
svm_node *x = (struct svm_node *) malloc(1025*sizeof(struct svm_node));
for(int i=0; i<1024; i++){
    x[i].index = i;
    x[i].value = (double)oneCol.at<float>(i);
}
x[1024].index = -1;

double *prob_estimates = NULL;
prob_estimates = (double *) malloc(svmModel->nr_class*sizeof(double));

double retVal = svm_predict_probability(svmModel, x, prob_estimates);
cout << retVal << endl;
for(int j=0; j<svmModel->nr_class; j++)
    cout << prob_estimates[j] << endl;
Here I attempt to load the vector from an OpenCV object, but the prediction comes out wrong. Is something wrong here?
for(int i=0; i<1024; i++){
    x[i].index = i+1; // libsvm feature indices are 1-based
    x[i].value = (double)oneCol.at<float>(i);
}
In LibSVM, indexes start at 1. Who knew :(
I want to write a 2D vector of floats to a HDF5 file.
I used the following code (writeh5.cpp):
#include <cstdlib>
#include <ctime>
#include <iostream>
#include <string>
#include <vector>
#include <iterator>
#include <H5Cpp.h>
using namespace H5;
using namespace std;
int main(void) {
    int nrow = 5;
    int ncol = 4;

    vector<vector< double > > vec2d;
    vec2d.resize(nrow, vector<double>(ncol, 0.0));

    srand((unsigned)time(0));

    typename vector< vector< double > >::iterator row;
    typename vector< double >::iterator col;
    for (row = vec2d.begin(); row != vec2d.end(); row++) {
        cout << endl;
        for (col = row->begin(); col != row->end(); col++) {
            *col = (rand()/(RAND_MAX+1.0));
            cout << *col << '\t';
        }
    }
    cout << endl;

    H5File file("test.h5", H5F_ACC_TRUNC);

    // dataset dimensions
    hsize_t dimsf[2];
    dimsf[0] = nrow;
    dimsf[1] = ncol;
    DataSpace dataspace(2, dimsf);
    DataType datatype(H5::PredType::NATIVE_DOUBLE);
    DataSet dataset = file.createDataSet("data", datatype, dataspace);

    // dataset.write(vec2d.data(), H5::PredType::NATIVE_DOUBLE);
    dataset.write(&vec2d[0][0], H5::PredType::NATIVE_DOUBLE);

    cout << endl << " vec2d has " << endl;
    for (row = vec2d.begin(); row != vec2d.end(); row++) {
        cout << endl;
        for (col = row->begin(); col != row->end(); col++) {
            cout << *col << '\t';
        }
    }
    cout << endl;

    dataset.close();
    dataspace.close();
    file.close();

    return 0;
}
I compiled it using g++ writeh5.cpp -I/usr/include/hdf5/ -lhdf5_cpp -lhdf5 -Wall
A run of the code produced the following output:
0.325553 0.598941 0.364489 0.0125061
0.374205 0.0319419 0.380329 0.815621
0.863754 0.386279 0.0173515 0.15448
0.703936 0.372486 0.728436 0.991631
0.666207 0.568983 0.807475 0.964276
It also produced the file test.h5.
Then, when I read this file from Python (using the following):
import h5py
import numpy as np
file = h5py.File("test.h5", 'r')
dataset = np.array(file["data"])
print dataset
file.close()
I got
[[ 3.25553381e-001 5.98941262e-001 3.64488814e-001 1.25061036e-002]
[ 0.00000000e+000 2.42092166e-322 3.74204732e-001 3.19418786e-002]
[ 3.80329057e-001 8.15620518e-001 0.00000000e+000 2.42092166e-322]
[ 8.63753530e-001 3.86278684e-001 1.73514970e-002 1.54479635e-001]
[ 0.00000000e+000 2.42092166e-322 7.03935940e-001 3.72486182e-001]]
The first row is good; the other rows are garbage.
I also tried dataset.write(&vec2d[0], ...) and dataset.write(vec2d[0].data(), ...), with similar problems.
I want to:
write an HDF5 file with the contents of a 2D std::vector of doubles,
read the file in Python and store the contents in a numpy array.
What am I doing wrong?
Apparently, I am not allowed to pass a std::vector of vectors to the write function. Copying the elements of the vector into a static array solves the problem, because the write function happily accepts this array.
However, I am not happy with this solution; I expected to be able to use the vectors directly in the write function.
Here is the code:
#include <cstdlib>
#include <ctime>
#include <iostream>
#include <string>
#include <vector>
#include <iterator>
#include <H5Cpp.h>

using namespace H5;
using namespace std;

int main(void) {
    int nrow = 5;
    int ncol = 4;

    vector<vector< double > > vec2d;
    vec2d.resize(nrow, vector<double>(ncol, 0.0));

    srand((unsigned)time(0));

    // generate some data
    typename vector< vector< double > >::iterator row;
    typename vector< double >::iterator col;
    for (row = vec2d.begin(); row != vec2d.end(); row++) {
        cout << endl;
        for (col = row->begin(); col != row->end(); col++) {
            *col = (rand()/(RAND_MAX+1.0));
            cout << *col << '\t';
        }
    }
    cout << endl;

    // copy the vector of vectors into a contiguous static array
    double varray[nrow][ncol];
    for( int i = 0; i<nrow; ++i) {
        cout << endl;
        for( int j = 0; j<ncol; ++j) {
            varray[i][j] = vec2d[i][j];
        }
    }

    H5File file("test.h5", H5F_ACC_TRUNC);

    // dataset dimensions
    hsize_t dimsf[2];
    dimsf[0] = nrow;
    dimsf[1] = ncol;
    DataSpace dataspace(2, dimsf);
    DataType datatype(H5::PredType::NATIVE_DOUBLE);
    DataSet dataset = file.createDataSet("data", datatype, dataspace);

    dataset.write(varray, H5::PredType::NATIVE_DOUBLE);
    cout << endl;

    dataset.close();
    dataspace.close();
    file.close();

    return 0;
}
I ran into the same problem when I converted my data from a vector to a dynamic 2D array. The problem with the write call is not that it will not accept a vector; it does not understand the concept of a pointer array and only writes out contiguous memory. A vector of vectors is not contiguous in memory; it is an array of pointers to separately allocated buffers. That is why the first row was correct when you passed the first element: the rest of the table is just the garbage in memory that follows the first vector.
My solution was to create one giant 1D vector and perform my own indexing to convert back and forth, as sketched below. This is similar to the approach in h5_writedyn.c https://www.hdfgroup.org/ftp/HDF5/examples/misc-examples/h5_writedyn.c
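A minimal sketch of that flat-vector approach, reusing the file and dataset names from the example above (placeholder data):

#include <vector>
#include <H5Cpp.h>
using namespace H5;

int main() {
    const int nrow = 5, ncol = 4;

    // One contiguous buffer; element (i, j) lives at index i*ncol + j
    std::vector<double> flat(nrow * ncol);
    for (int i = 0; i < nrow; ++i)
        for (int j = 0; j < ncol; ++j)
            flat[i*ncol + j] = i + 0.1 * j; // placeholder data

    H5File file("test.h5", H5F_ACC_TRUNC);
    hsize_t dimsf[2] = { nrow, ncol };
    DataSpace dataspace(2, dimsf);
    DataSet dataset = file.createDataSet("data", PredType::NATIVE_DOUBLE, dataspace);

    // flat.data() points at nrow*ncol contiguous doubles, which is what write() expects
    dataset.write(flat.data(), PredType::NATIVE_DOUBLE);
    return 0;
}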
What is this? The question says the code gives
0.325553 0.598941 0.364489 0.0125061
0.374205 0.0319419 0.380329 0.815621
0.863754 0.386279 0.0173515 0.15448
0.703936 0.372486 0.728436 0.991631
0.666207 0.568983 0.807475 0.964276
I don't see a print in your C++ code that produces this. Did you read the file with some other tool?
(Yes, this is a clarifying question, but it requires too much formatting to fit in a comment.)
https://stackoverflow.com/a/24622720/901925
Writing 2-D array int[n][m] to HDF5 file using Visual C++
The solution there talks about writing a vector of vectors, and also about writing variable-length arrays.
You may have to put the dataset write inside a row iterator:
for (row = vec2d.begin(); row != vec2d.end(); row++) {
    dataset.write(row->data(), H5::PredType::NATIVE_DOUBLE);
    // (*row is a vector, not a raw pointer, so pass row->data() or &(*row)[0])
}