Read string in HDF5/C++ - python

I have stored a string (and a vector) in my HDF5 archive, for example with the Python interface:
import h5py
file = h5py.File("example.h5","w")
file['/path/to/vector'] = [0., 1., 2.]
file['/path/to/string'] = 'test'
Now I want to read the string into a std::string. I know how to read the vector (see below), but I have absolutely no idea how to read the string. What I particularly don't understand is how to allocate the result, as:
The H5Cpp library does not seem to use STL containers, but rather raw pointers, requiring pre-allocation.
This is somewhat contradicted by the observation that HDFView indicates the dimension size to be 1 and the type to be "String, length = variable".
Here is how I read the vector:
#include "H5Cpp.h"
#include <vector>
#include <iostream>
int main()
{
// open file
H5::H5File fid = H5::H5File("example.h5",H5F_ACC_RDONLY);
// open dataset
H5::DataSet dataset = fid.openDataSet("/path/to/vector");
H5::DataSpace dataspace = dataset.getSpace();
H5T_class_t type_class = dataset.getTypeClass();
// check data type
if ( type_class != H5T_FLOAT )
throw std::runtime_error("Unable to read, incorrect data-type");
// check precision
// - get storage type
H5::FloatType datatype = dataset.getFloatType();
// - get number of bytes
size_t precision = datatype.getSize();
// - check precision
if ( precision != sizeof(double) )
throw std::runtime_error("Unable to read, incorrect precision");
// get the size
// - read rank (a.k.a number of dimensions)
int rank = dataspace.getSimpleExtentNdims();
// - allocate
std::vector<hsize_t> hshape(rank);
// - read
dataspace.getSimpleExtentDims(hshape.data(), NULL);
// - total size (product of the dimensions)
size_t size = 1;
for ( int i = 0 ; i < rank ; ++i ) size *= static_cast<size_t>(hshape[i]);
// allocate output
std::vector<double> data(size);
// read data
dataset.read(data.data(), H5::PredType::NATIVE_DOUBLE);
// print data
for ( auto &i : data )
std::cout << i << std::endl;
}
(compiled with h5c++ -std=c++14 so.cpp)

I have found a solution:
#include "H5Cpp.h"
#include <vector>
#include <iostream>
int main()
{
// open file
H5::H5File fid = H5::H5File("example.h5",H5F_ACC_RDONLY);
// open dataset, get data-type
H5::DataSet dataset = fid.openDataSet("/path/to/string");
H5::DataSpace dataspace = dataset.getSpace();
H5::StrType datatype = dataset.getStrType();
// allocate output
std::string data;
// read output
dataset.read(data, datatype, dataspace);
std::cout << data << std::endl;
}
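For robustness you can mirror the type check used for the vector before reading. Here is a minimal sketch under the same file and dataset names as above; the isVariableStr() call is my addition to show the fixed/variable-length distinction that HDFView reports:
// open dataset and verify it actually holds a string
H5::DataSet ds = fid.openDataSet("/path/to/string");
if ( ds.getTypeClass() != H5T_STRING )
    throw std::runtime_error("Unable to read, not a string dataset");
// distinguish variable-length strings (what h5py writes by default) from fixed-length ones
H5::StrType st = ds.getStrType();
std::cout << (st.isVariableStr() ? "variable-length" : "fixed-length") << std::endl;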

Related

Transferring C++ CRC16 Modbus RTU calculator to Python

Please excuse me if the question is simple; I'm fairly new to CRC.
I have a C++ CRC16 function with a predefined CRC table, and I want to get exactly the same results in Python. I have tried several libraries, but all of them gave me different results. The reason is probably a difference in CRC tables, but none of the algorithms I found expose a table that I can swap with mine.
Original func:
uint16_t CRC16(const uint8_t *data, int len)
{
static const uint16_t crc_table[] = {
0x0000,0x8005,0x800F,0x000A,0x801B,0x001E,0x0014,0x8011,
0x8033,0x0036,0x003C,0x8039,0x0028,0x802D,0x8027,0x0022,
0x8063,0x0066,0x006C,0x8069,0x0078,0x807D,0x8077,0x0072,
0x0050,0x8055,0x805F,0x005A,0x804B,0x004E,0x0044,0x8041,
0x80C3,0x00C6,0x00CC,0x80C9,0x00D8,0x80DD,0x80D7,0x00D2,
0x00F0,0x80F5,0x80FF,0x00FA,0x80EB,0x00EE,0x00E4,0x80E1,
0x00A0,0x80A5,0x80AF,0x00AA,0x80BB,0x00BE,0x00B4,0x80B1,
0x8093,0x0096,0x009C,0x8099,0x0088,0x808D,0x8087,0x0082,
0x8183,0x0186,0x018C,0x8189,0x0198,0x819D,0x8197,0x0192,
0x01B0,0x81B5,0x81BF,0x01BA,0x81AB,0x01AE,0x01A4,0x81A1,
0x01E0,0x81E5,0x81EF,0x01EA,0x81FB,0x01FE,0x01F4,0x81F1,
0x81D3,0x01D6,0x01DC,0x81D9,0x01C8,0x81CD,0x81C7,0x01C2,
0x0140,0x8145,0x814F,0x014A,0x815B,0x015E,0x0154,0x8151,
0x8173,0x0176,0x017C,0x8179,0x0168,0x816D,0x8167,0x0162,
0x8123,0x0126,0x012C,0x8129,0x0138,0x813D,0x8137,0x0132,
0x0110,0x8115,0x811F,0x011A,0x810B,0x010E,0x0104,0x8101,
0x8303,0x0306,0x030C,0x8309,0x0318,0x831D,0x8317,0x0312,
0x0330,0x8335,0x833F,0x033A,0x832B,0x032E,0x0324,0x8321,
0x0360,0x8365,0x836F,0x036A,0x837B,0x037E,0x0374,0x8371,
0x8353,0x0356,0x035C,0x8359,0x0348,0x834D,0x8347,0x0342,
0x03C0,0x83C5,0x83CF,0x03CA,0x83DB,0x03DE,0x03D4,0x83D1,
0x83F3,0x03F6,0x03FC,0x83F9,0x03E8,0x83ED,0x83E7,0x03E2,
0x83A3,0x03A6,0x03AC,0x83A9,0x03B8,0x83BD,0x83B7,0x03B2,
0x0390,0x8395,0x839F,0x039A,0x838B,0x038E,0x0384,0x8381,
0x0280,0x8285,0x828F,0x028A,0x829B,0x029E,0x0294,0x8291,
0x82B3,0x02B6,0x02BC,0x82B9,0x02A8,0x82AD,0x82A7,0x02A2,
0x82E3,0x02E6,0x02EC,0x82E9,0x02F8,0x82FD,0x82F7,0x02F2,
0x02D0,0x82D5,0x82DF,0x02DA,0x82CB,0x02CE,0x02C4,0x82C1,
0x8243,0x0246,0x024C,0x8249,0x0258,0x825D,0x8257,0x0252,
0x0270,0x8275,0x827F,0x027A,0x826B,0x026E,0x0264,0x8261,
0x0220,0x8225,0x822F,0x022A,0x823B,0x023E,0x0234,0x8231,
0x8213,0x0216,0x021C,0x8219,0x0208,0x820D,0x8207,0x0202
};
uint16_t crc_word = 0xFFFF;
while (len--)
crc_word = (crc_word << 8) ^ crc_table[(((crc_word >> 8) & 0x00FF) ^ *data++) & 0x00FF];
return crc_word;
}
uint16_t crc = CRC16(((uint8_t*)&tx_frame) + 1, (int)(tx_frame.num_data_bytes + 1));
uint8_t crc1 = crc >> 8;
uint8_t crc2 = 0xFF & crc;
tx_frame.crc = crc1 + crc2 * 256;
The parameters and name of that CRC:
width=16 poly=0x8005 init=0xffff refin=false refout=false xorout=0x0000 check=0xaee7 residue=0x0000 name="CRC-16/CMS"
In Python:
>>> import crcmod
>>> crc16 = crcmod.mkCrcFun(0x18005, rev=False, initCrc=0xffff)
>>> print(hex(crc16(b'123456789')))
0xaee7
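To confirm that the crcmod parameters really match the original table-driven function, you can run the C++ code on the standard check string "123456789"; per the parameters above it should print 0xAEE7, the same value crcmod returns. A minimal sketch, assuming the CRC16() function from the question is defined above it:
#include <cstdint>
#include <cstdio>
#include <cstring>
int main() {
    const char check[] = "123456789";
    uint16_t crc = CRC16(reinterpret_cast<const uint8_t*>(check),
                         static_cast<int>(std::strlen(check)));
    std::printf("0x%04X\n", crc);   // expected: 0xAEE7
    return 0;
}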

PyImport_Import segmentation fault after reading in TSV with C++

I am using C++ as a wrapper around a Python module. First, I read in a TSV file, cast it as a numpy array, import my Python module, and then pass the numpy array to Python for further analysis. When I first wrote the program, I was testing everything using a randomly generated array, and it worked well. However, once I replaced the randomly generated array with the imported TSV array, I got a segmentation fault when I tried to import the Python module. Here is some of my code:
#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION
#define PY_SSIZE_T_CLEAN
#include <python3.8/Python.h>
#include "./venv/lib/python3.8/site-packages/numpy/core/include/numpy/arrayobject.h"
#include <stdio.h>
#include <iostream>
#include <stdlib.h>
#include <random>
#include <fstream>
#include <sstream>
int main(int argc, char* argv[]) {
setenv("PYTHONPATH", ".", 0);
Py_Initialize();
import_array();
static const int numberRows = 1000;
static const int numberColumns = 500;
npy_intp dims[2]{ numberRows, numberColumns };
static const int numberDims = 2;
double(*c_arr)[numberColumns]{ new double[numberRows][numberColumns] };
// ***********************************************************
// THIS PART OF THE CODE GENERATES A RANDOM ARRAY AND WORKS WITH THE REST OF THE CODE
// // initialize random number generation
// typedef std::mt19937 MyRNG;
// std::random_device r;
// MyRNG rng{r()};
// std::lognormal_distribution<double> lognormalDistribution(1.6, 0.25);
// //populate array
// for (int i=0; i < numberRows; i++) {
// for (int j=0; j < numberColumns; j++) {
// c_arr[i][j] = lognormalDistribution(rng);
// }
// }
// ***********************************************************
// ***********************************************************
// THIS PART OF THE CODE INGESTS AN ARRAY FROM TSV AND CAUSES CODE TO FAIL AT PyImport_Import
std::ifstream data("data.mat");
std::string line;
int row = 0;
int column = 0;
while (std::getline(data, line)) {
std::stringstream lineStream(line);
std::string cell;
while (std::getline(lineStream, cell, '\t')) {
c_arr[row][column] = std::stod(cell);
column++;
}
row++;
column = 0;
if (row > numberRows) {
break;
}
}
// ***********************************************************
PyArrayObject *npArray = reinterpret_cast<PyArrayObject*>(
PyArray_SimpleNewFromData(numberDims, dims, NPY_DOUBLE, reinterpret_cast<void*>(c_arr))
);
const char *moduleName = "cpp_test";
PyObject *pname = PyUnicode_FromString(moduleName);
// ***********************************************************
// CODE FAILS HERE - SEGMENTATION FAULT
PyObject *pyModule = PyImport_Import(pname);
// .......
// THERE IS MORE CODE BELOW NOT INCLUDED HERE
}
So I'm not sure why the code fails when ingesting data from a TSV file, but not when I use randomly generated data.
EDIT: (very stupid mistake incoming) I used the condition row > numberRows as the stopping condition in the while loop, which let the loop write one row past the end of the array (valid indices run from 0 to numberRows - 1) and corrupt adjacent memory. Once I changed that condition to row == numberRows, everything worked. Who knew being specific about rows when building an array was so important? I'll leave this up as a testament to stupid programming mistakes; maybe someone will learn a little something from it.
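For reference, a minimal sketch of the corrected ingest loop (same variable names as in the code above), with the bounds checked before each write instead of after:
// stop before writing past row numberRows - 1, and guard columns the same way
while (row < numberRows && std::getline(data, line)) {
    std::stringstream lineStream(line);
    std::string cell;
    column = 0;
    while (column < numberColumns && std::getline(lineStream, cell, '\t')) {
        c_arr[row][column] = std::stod(cell);
        column++;
    }
    row++;
}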
Note that you don't have to use built-in arrays for storing the values in a 2D layout; you can also use a dynamically sized container like std::vector, as shown below. The advantage of std::vector is that you don't need to know the number of rows and columns in your input file (data.mat) beforehand, so you don't have to pre-allocate anything: values can be added dynamically.
#include <iostream>
#include <vector>
#include <string>
#include <sstream>
#include <fstream>
int main() {
std::string line;
double word;
std::ifstream inFile("data.mat");
//create/use a std::vector instead of a built-in array
std::vector<std::vector<double>> vec;
if(inFile)
{
while(getline(inFile, line, '\n'))
{
//create a temporary vector that will contain all the columns
std::vector<double> tempVec;
std::istringstream ss(line);
//read word by word(or double by double)
while(ss >> word)
{
//std::cout<<"word:"<<word<<std::endl;
//add the word to the temporary vector
tempVec.push_back(word);
}
//now all the words from the current line have been added to the temporary vector
vec.emplace_back(tempVec);
}
}
else
{
std::cout<<"file cannot be opened"<<std::endl;
}
inFile.close();
//let's print the elements of the 2D vector so that we can confirm it contains all the right elements (rows and columns)
for(std::vector<double> &newvec: vec)
{
for(const double &elem: newvec)
{
std::cout<<elem<<" ";
}
std::cout<<std::endl;
}
return 0;
}
Since you didn't provide a data.mat file, I created an example data.mat file and used it to confirm that the program reads all rows and columns correctly.
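If you still need to hand the data to NumPy as in your original code, one option is to flatten the 2D vector into a single contiguous buffer first, since PyArray_SimpleNewFromData expects contiguous memory. A hedged sketch under that assumption, reusing vec from the answer and NPY_DOUBLE from your question (the flat vector must stay alive as long as the NumPy array is used):
// flatten the rows of vec into one contiguous buffer
std::vector<double> flat;
for (const std::vector<double> &rowVec : vec)
    flat.insert(flat.end(), rowVec.begin(), rowVec.end());
npy_intp shape[2] = { static_cast<npy_intp>(vec.size()),
                      static_cast<npy_intp>(vec.empty() ? 0 : vec[0].size()) };
PyObject *npArray = PyArray_SimpleNewFromData(2, shape, NPY_DOUBLE, flat.data());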

How to divide a binary file to 6-byte blocks in C++ or Python with fast speed? [closed]

I’m reading a file in C++ and Python as a binary file. I need to divide the binary into blocks, each 6 bytes. For example, if my file is 600 bytes, the result should be 100 blocks, each 6 bytes.
I have tried struct (in C++ and Python) and array (Python). None of them divide the binary into blocks of 6 bytes; they can only divide it into blocks whose size is a power of two (1, 2, 4, 8, 16, etc.).
The array approach was very fast, reading 1 GB of binary data in less than a second as blocks of 4 bytes. In contrast, the other methods I tried are extremely slow, taking tens of minutes for just a few megabytes.
How can I read the binary as blocks of 6 bytes as fast as possible? Any help in either C++ or Python will be great. Thank you.
EDIT - The Code:
struct Block
{
char data[6];
};
class BinaryData
{
private:
char data[6];
public:
BinaryData() {};
~BinaryData() {};
void readBinaryFile(string strFile)
{
Block block;
ifstream binaryFile;
int size = 0;
binaryFile.open(strFile, ios::out | ios::binary);
binaryFile.seekg(0, ios::end);
size = (int)binaryFile.tellg();
binaryFile.seekg(0, ios::beg);
cout << size << endl;
while ( (int)binaryFile.tellg() < size )
{
cout << binaryFile.tellg() << " , " << size << " , " <<
size - (int)binaryFile.tellg() << endl;
binaryFile.read((char*)block.data,sizeof(block.data));
cout << block.data << endl;
//cin >> block.data;
if (size - (int)binaryFile.tellg() > size)
{
break;
}
}
binaryFile.close();
}
};
Notes:
In the file, the numbers are stored in big-endian order.
The goal is to read them as fast as possible and then sort them in ascending order.
Let's start simple, then optimize.
Simple Loop
uint8_t array1[6];
while (my_file.read((char *) &array1[0], 6))
{
Process_Block(&array1[0]);
}
The above code reads in a file, 6 bytes at a time and sends the block to a function.
Meets the requirements, not very optimal.
Reading Larger Blocks
Files are streaming devices. They have an overhead to start streaming, but are very efficient once streaming. In other words, we want to read as much data as possible per transaction to reduce the overhead.
static const unsigned int CAPACITY = 6 * 1024;
uint8_t block1[CAPACITY];
while (my_file)
{
my_file.read((char *) &block1[0], CAPACITY);
// gcount() may be less than CAPACITY on the final read
const size_t bytes_read = my_file.gcount();
size_t blocks_read = bytes_read / 6;
uint8_t const * block_pointer = &block1[0];
while (blocks_read > 0)
{
Process_Block(block_pointer);
block_pointer += 6;
--blocks_read;
}
}
The above code reads up to 1024 blocks in one transaction. After reading, each block is sent to a function for processing.
This version is more efficient than the Simple Loop, as it reads more data per transaction. Adjust the CAPACITY to find the optimal size on your platform.
Loop Unrolling
The previous code reduces the first bottleneck of input transfer speed (although there is still room for optimization). Another technique is to reduce the overhead of the processing loop by performing more data processing inside the loop. This is called loop unrolling.
const size_t bytes_read = my_file.gcount();
size_t blocks_read = bytes_read / 6; // not const: decremented below
uint8_t const * block_pointer = &block1[0];
while ((blocks_read / 4) != 0)
{
Process_Block(block_pointer);
block_pointer += 6;
Process_Block(block_pointer);
block_pointer += 6;
Process_Block(block_pointer);
block_pointer += 6;
Process_Block(block_pointer);
block_pointer += 6;
blocks_read -= 4;
}
while (blocks_read > 0)
{
Process_Block(block_pointer);
block_pointer += 6;
--blocks_read;
}
You can adjust the quantity of operations in the loop, to see how it affects your program's speed.
Multi-Threading & Multiple Buffers
Another two techniques for speeding up the reading of the data, are to use multiple threads and multiple buffers.
One thread, an input thread, reads the file into a buffer. After reading into the first buffer, the thread sets a semaphore indicating there is data to process. The input thread reads into the next buffer. This repeats until the data is all read. (For a challenge, figure out how to reuse the buffers and notify the other thread of which buffers are available).
The second thread is the processing thread. This processing thread is started first and waits for the first buffer to be completely read. After the buffer has the data, the processing thread starts processing the data. After the first buffer has been processed, the processing thread starts on the next buffer. This repeats until all the buffers have been processed.
The goal here is to use as many buffers as necessary to keep the processing thread running and not waiting.
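Here is a minimal sketch of that scheme, using a simple queue of filled buffers instead of reusing a fixed set of buffers (Process_Block and the file name are assumptions carried over from the earlier snippets; this is an illustration, not a tuned implementation):
#include <condition_variable>
#include <cstdint>
#include <fstream>
#include <mutex>
#include <queue>
#include <thread>
#include <vector>

static const size_t CAPACITY = 6 * 1024;

void Process_Block(const uint8_t *) { /* process one 6-byte block */ }

int main()
{
    std::queue<std::vector<uint8_t>> filled; // buffers ready for processing
    std::mutex m;
    std::condition_variable cv;
    bool done = false;

    // input thread: read the file and hand filled buffers to the processing thread
    std::thread input([&] {
        std::ifstream my_file("input.bin", std::ios::binary);
        for (;;)
        {
            std::vector<uint8_t> buf(CAPACITY);
            my_file.read(reinterpret_cast<char *>(buf.data()), CAPACITY);
            buf.resize(static_cast<size_t>(my_file.gcount()));
            if (buf.empty())
                break;
            {
                std::lock_guard<std::mutex> lock(m);
                filled.push(std::move(buf));
            }
            cv.notify_one();
        }
        {
            std::lock_guard<std::mutex> lock(m);
            done = true;
        }
        cv.notify_one();
    });

    // processing thread (here simply the main thread)
    for (;;)
    {
        std::unique_lock<std::mutex> lock(m);
        cv.wait(lock, [&] { return !filled.empty() || done; });
        if (filled.empty() && done)
            break;
        std::vector<uint8_t> buf = std::move(filled.front());
        filled.pop();
        lock.unlock();
        for (size_t offset = 0; offset + 6 <= buf.size(); offset += 6)
            Process_Block(buf.data() + offset);
    }
    input.join();
    return 0;
}
Letting the queue grow unbounded trades memory for simplicity; the buffer-reuse variant suggested above bounds memory at the cost of extra bookkeeping.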
Edit 1: Other techniques
Memory Mapped Files
Some operating systems support memory mapped files. The OS reads a portion of the file into memory. When a location outside the memory is accessed, the OS loads another portion into memory. Whether this technique improves performance needs to be measured (profiled).
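As a rough illustration only, a POSIX-specific sketch of the memory-mapped approach (mmap/munmap are an assumption about the platform; Windows would use CreateFileMapping/MapViewOfFile instead):
#include <cstdint>
#include <fcntl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>

void Process_Block(const uint8_t *); // as in the earlier snippets

void process_mapped(const char *path)
{
    int fd = open(path, O_RDONLY);
    if (fd < 0)
        return;
    struct stat st;
    if (fstat(fd, &st) == 0 && st.st_size >= 6)
    {
        const size_t size = static_cast<size_t>(st.st_size);
        void *mapping = mmap(nullptr, size, PROT_READ, MAP_PRIVATE, fd, 0);
        if (mapping != MAP_FAILED)
        {
            const uint8_t *base = static_cast<const uint8_t *>(mapping);
            // walk the mapping in 6-byte steps; the OS pages data in on demand
            for (size_t offset = 0; offset + 6 <= size; offset += 6)
                Process_Block(base + offset);
            munmap(mapping, size);
        }
    }
    close(fd);
}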
Parallel Processing & Threading
Adding multiple threads may show negligible performance gain. Computers have a data bus (data highway) connecting many hardware devices, including memory, file I/O and the processor. Devices will be paused to let other devices use the data highway. With multiple cores or processors, one processor may have to wait while the other processor is using the data highway. This waiting may cause negligible performance gain when using multiple threads or parallel processing. Also, the operating system has overhead when constructing and maintaining threads.
Try this. The input file is passed as an argument to the program. As you said, I assume the 6-byte values in the file are written in big-endian order, but I make no assumption about the machine that reads and sorts them; the program works on both little- and big-endian hosts (the case is checked at run time).
#include <iostream>
#include <fstream>
#include <vector>
#include <cstdint>
#include <algorithm>
#include <limits.h> // CHAR_BIT
using namespace std;
#if CHAR_BIT != 8
# error that code supposes a char has 8 bits
#endif
int main(int argc, char ** argv)
{
if (argc != 2)
cerr << "Usage: " << argv[1] << " <file>" << endl;
else {
ifstream in(argv[1], ios::binary);
if (!in.is_open())
cerr << "Cannot open " << argv[1] << endl;
else {
in.seekg(0, ios::end);
size_t n = (size_t) in.tellg() / 6;
vector<uint64_t> values(n);
uint64_t * p = values.data(); // for performance
uint64_t * psup = p + n;
in.seekg(0, ios::beg);
int i = 1;
if (*((char *) &i)) {
// little endian
unsigned char s[6];
uint64_t v = 0;
while (p != psup) {
if (!in.read((char *) s, 6))
return -1;
((char *) &v)[0] = s[5];
((char *) &v)[1] = s[4];
((char *) &v)[2] = s[3];
((char *) &v)[3] = s[2];
((char *) &v)[4] = s[1];
((char *) &v)[5] = s[0];
*p++ = v;
}
}
else {
// big endian
uint64_t v = 0;
while (p != psup) {
if (!in.read(((char *) &v) + 2, 6))
return -1;
*p++ = v;
}
}
cout << "file successfully read" << endl;
sort(values.begin(), values.end());
cout << "values sort" << endl;
// DEBUG, DO ON A SMALL FILE ;-)
for (auto v : values)
cout << v << endl;
}
}
}

Serialize raw Image buffer (rgb pixels) in C and deserialize in Python

I want to serialize raw image data, i.e. a uint16 array, and send it over to Python using ZMQ. I considered using msgpack-c, but the only way I found was something like the approach in "How do I unpack and extract data properly using msgpack-c?".
If I follow this approach, I have to pack each element of my C array separately, which will make it very slow.
Could someone please point me in the right direction?
You can send the uint16_t array from the C side as-is, and use the ctypes module to access it in the Python code.
Sending c code:
#include <stdint.h>
#include <stdio.h>
#include <zmq.h>
#define IMAGE_SIZE (256 * 256)
unsigned checksum(uint16_t* data, int len) {
unsigned s = 0;
for (int i = 0; i < len; ++i) {
s += data[i];
}
return s;
}
int main() {
uint16_t image[IMAGE_SIZE]; /* left uninitialized in this example; fill with real pixel data in practice */
printf("image checksum: %i\n", checksum(image, IMAGE_SIZE));
void* context = zmq_ctx_new();
void* push = zmq_socket(context, ZMQ_PUSH);
zmq_connect(push, "tcp://127.0.0.1:5555");
zmq_send(push, image, IMAGE_SIZE * sizeof(uint16_t), 0);
zmq_close(push);
zmq_ctx_destroy(context);
return 0;
}
Receiving python code:
from ctypes import c_uint16
import zmq
IMAGE_SIZE = 256 * 256
Image = c_uint16 * IMAGE_SIZE # corresponds to uint16_t[IMAGE_SIZE]
context = zmq.Context(1)
pull = zmq.Socket(context, zmq.PULL)
pull.bind("tcp://127.0.0.1:5555")
message = pull.recv()
image = Image.from_buffer_copy(message)
# This should print the same number as the sending code
# Note that it is different from sum(message)
print(sum(image))

How to save and restore the state of a random number generator with CUDA CURAND?

I am using a large CUDA-matrix library developed within our organization. I need to save the state of a CUDA RNG to take a snapshot of a long-running simulation, and be able to restore it later. This is simple with, e.g., Python+NumPy:
state = numpy.random.get_state()
# state is a tuple with 5 fields which can be pickled, etc.
...
numpy.random.set_state(state)
I cannot seem to find equivalent functionality in the CURAND host API. You can set the seed and offset, but there is no way to retrieve them in order to save them. The device API seems to offer something like this, but this library uses the host API, and it would be monstrous to change.
The hacky solution I am thinking about is to keep track of the number of calls to the RNG (reset whenever a seed is set) and simply replay that many RNG calls. However, I am not sure whether the function parameters (e.g. matrix shapes) must be identical to end up in the same state. Similarly, if the number of calls were equivalent to the offset parameter used to initialize the RNG, that would work as well; i.e. if I call the RNG 200 times, I could set the offset to 200. However, in Python, the offset in the state can increase by more than 1 with each call, so this is also potentially wrong.
Any insights into how to tackle this are appreciated!
For the CURAND Host API, I believe curandSetGeneratorOffset() can probably work for this.
Here's a modified example from the curand host API documentation:
$ cat t721.cu
/*
* This program uses the host CURAND API to generate 10
* pseudorandom floats. And then regenerate those same floats.
*/
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <curand.h>
#define CUDA_CALL(x) do { if((x)!=cudaSuccess) { \
printf("Error at %s:%d\n",__FILE__,__LINE__);\
return EXIT_FAILURE;}} while(0)
#define CURAND_CALL(x) do { if((x)!=CURAND_STATUS_SUCCESS) { \
printf("Error at %s:%d\n",__FILE__,__LINE__);\
return EXIT_FAILURE;}} while(0)
int main(int argc, char *argv[])
{
size_t n = 10;
size_t i;
curandGenerator_t gen;
float *devData, *hostData;
/* Allocate n floats on host */
hostData = (float *)calloc(n, sizeof(float));
/* Allocate n floats on device */
CUDA_CALL(cudaMalloc((void **)&devData, n*sizeof(float)));
/* Create pseudo-random number generator */
CURAND_CALL(curandCreateGenerator(&gen,
CURAND_RNG_PSEUDO_DEFAULT));
/* Set seed */
CURAND_CALL(curandSetPseudoRandomGeneratorSeed(gen,
1234ULL));
// generator offset = 0
/* Generate n floats on device */
CURAND_CALL(curandGenerateUniform(gen, devData, n));
// generator offset = n
/* Generate n floats on device */
CURAND_CALL(curandGenerateUniform(gen, devData, n));
// generator offset = 2n
/* Copy device memory to host */
CUDA_CALL(cudaMemcpy(hostData, devData, n * sizeof(float),
cudaMemcpyDeviceToHost));
/* Show result */
for(i = 0; i < n; i++) {
printf("%1.4f ", hostData[i]);
}
printf("\n\n");
CURAND_CALL(curandSetGeneratorOffset(gen, n));
// generator offset = n
CURAND_CALL(curandGenerateUniform(gen, devData, n));
// generator offset = 2n
/* Copy device memory to host */
CUDA_CALL(cudaMemcpy(hostData, devData, n * sizeof(float),
cudaMemcpyDeviceToHost));
/* Show result */
for(i = 0; i < n; i++) {
printf("%1.4f ", hostData[i]);
}
printf("\n");
/* Cleanup */
CURAND_CALL(curandDestroyGenerator(gen));
CUDA_CALL(cudaFree(devData));
free(hostData);
return EXIT_SUCCESS;
}
$ nvcc -o t721 t721.cu -lcurand
$ ./t721
0.7816 0.2338 0.6791 0.2824 0.6299 0.1212 0.4333 0.3831 0.5136 0.2987
0.7816 0.2338 0.6791 0.2824 0.6299 0.1212 0.4333 0.3831 0.5136 0.2987
$
So you'll need to keep track of the quantity of random numbers generated (not the number of RNG function calls) up to the point when you do your checkpoint, and save that.
When you restart, initialize the generator in the same way:
/* Create pseudo-random number generator */
CURAND_CALL(curandCreateGenerator(&gen,
CURAND_RNG_PSEUDO_DEFAULT));
/* Set seed */
CURAND_CALL(curandSetPseudoRandomGeneratorSeed(gen,
1234ULL));
but then advance by the number of previously generated values (n):
CURAND_CALL(curandSetGeneratorOffset(gen, n));
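A minimal sketch of what that bookkeeping could look like, assuming a single generator and a small wrapper of your own around the generate calls (the g_offset variable and the generateUniformTracked name are illustrative, not part of CURAND; the includes are the same as in the example above):
// running count of 32-bit values generated since the seed was set
size_t g_offset = 0;

curandStatus_t generateUniformTracked(curandGenerator_t gen, float *out, size_t n)
{
    curandStatus_t status = curandGenerateUniform(gen, out, n);
    if (status == CURAND_STATUS_SUCCESS)
        g_offset += n; // 32-bit floats: the offset advances by n
    return status;
}

// checkpoint: persist the seed and g_offset
// restore:    recreate the generator, set the same seed, then
//             curandSetGeneratorOffset(gen, g_offset);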
So, it is possible to store and restore the state by tracking the number of 32-bit values generated and using curandSetGeneratorOffset. The algorithm looks something like:
template<typename T> void RNG(T* X, size_t N /*number of values*/){
...
if (sizeof(T) == 1)
offset += (N+4-1)/4;
else if (sizeof(T) == 2)
offset += (N+2-1)/2;
else if (sizeof(T) == 4 || USING_GENERATE_UNIFORM_DOUBLE)
offset += N;
else if (sizeof(T) == 8)
offset += 2*N;
}
For N 8-bit values generated, advance the offset by N rounded up to the next multiple of 4, divided by 4 (i.e. ceil(N/4)). For 16-bit values, advance by ceil(N/2). For 32-bit values, advance by N, and for 64-bit values advance by 2*N.
HOWEVER, if you use GenerateUniformDouble, you only need to advance by N, not 2*N. I'm not sure why.
Thanks for the help!
