Reading variable length array in HDF5 C++ - python

I am trying to read an hdf5 file containing variable-length vectors of doubles in C++. I used the following code to create the hdf5 file. It contains one dataset called "test" containing 100 rows of varying lengths. I had to make a couple of changes to the code in the link, so for convenience here is the exact code I used to write the data to hdf5:
#include <iostream>
#include <string>
#include <H5Cpp.h>
#include <vector>
#include <random>
const hsize_t n_dims = 1;
const hsize_t n_rows = 100;
const std::string dataset_name = "test";
int main () {
H5::H5File file("vlen_cpp.hdf5", H5F_ACC_TRUNC);
H5::DataSpace dataspace(n_dims, &n_rows);
// target dtype for the file
auto item_type = H5::PredType::NATIVE_DOUBLE;
auto file_type = H5::VarLenType(&item_type);
// dtype of the generated data
auto mem_type = H5::VarLenType(&item_type);
H5::DataSet dataset = file.createDataSet(dataset_name, file_type, dataspace);
std::vector<std::vector<double>> data;
data.reserve(n_rows);
// this structure stores length of each varlen row and a pointer to
// the actual data
std::vector<hvl_t> varlen_spec(n_rows);
std::mt19937 gen;
std::normal_distribution<double> normal(0.0, 1.0);
std::poisson_distribution<hsize_t> poisson(20);
for (hsize_t idx=0; idx < n_rows; idx++) {
data.emplace_back();
hsize_t size = poisson(gen);
data.at(idx).reserve(size);
varlen_spec.at(idx).len = size;
varlen_spec.at(idx).p = (void*) &data.at(idx).front();
for (hsize_t i = 0; i < size; i++) {
data.at(idx).push_back(normal(gen));
}
}
dataset.write(&varlen_spec.front(), mem_type);
return 0;
}
I am very new to C++ and my issue is trying to read the data back out of this file in C++. I tried to mimic what I would do in Python, but didn't have any luck. In Python, I would do this:
import h5py
import numpy as np
# Open the file produced by the C++ writer, read-only.
data = h5py.File("vlen_cpp.hdf5", "r")
i = 0 # This is the row I would want to read
arr = data["test"][i] # <-- This is the simplest way.
# Now trying to mimic something closer to C++
# Low-level identifier of the "test" dataset.
did = data["test"].id
# File-side dataspace: select only the element at index i.
dataspace = did.get_space()
dataspace.select_hyperslab(start=(i, ), count=(1, ))
# Memory-side dataspace: a single element, selected in full.
memspace = h5py.h5s.create_simple(dims_tpl=(1, ))
memspace.select_hyperslab(start=(0, ), count=(1, ))
# One-element object array that receives the row.
arr = np.zeros((1, ), dtype=object)
did.read(memspace, dataspace, arr)
print(arr) # This gives back the correct data
The Python code seems to work fine, so I tried to mimic those steps in C++:
#include <H5Cpp.h>
#include <string>
#include <vector>
#include <stdio.h>
// Attempts to read row 0 of the variable-length dataset "test" from the HDF5
// file named by argv[1] and print its values.
// NOTE(review): as posted, no row values are printed; see the notes at the
// reserve() and read() calls below.
int main(int argc, char **argv) {
std::string filename = argv[1];
// memtype of the file
auto itemType = H5::PredType::NATIVE_DOUBLE;
auto memType = H5::VarLenType(&itemType);
// get dataspace
H5::H5File file(filename, H5F_ACC_RDONLY);
H5::DataSet dataset = file.openDataSet("test");
H5::DataSpace dataspace = dataset.getSpace();
// get the size of the dataset
hsize_t rank;
hsize_t dims[1];
rank = dataspace.getSimpleExtentDims(dims); // rank = 1
std::cout << "Data size: "<< dims[0] << std::endl; // this is the correct number of values
// create memspace
hsize_t memDims[1] = {1};
H5::DataSpace memspace(rank, memDims);
// container to store read data
std::vector<std::vector<double>> data;
// Select hyperslabs
hsize_t dataCount[1] = {1};
hsize_t dataOffset[1] = {0}; // this should be i
hsize_t memCount[1] = {1};
hsize_t memOffset[1] = {0};
dataspace.selectHyperslab(H5S_SELECT_SET, dataCount, dataOffset);
memspace.selectHyperslab(H5S_SELECT_SET, memCount, memOffset);
// vector to store read data
std::vector<double> temp;
// reserve() only raises the capacity; temp.size() is still 0 afterwards, so
// the print loop below never executes regardless of what read() stored.
temp.reserve(20);
// NOTE(review): memType is a variable-length type but the destination is a
// buffer of doubles; the working version later in this document reads into an
// hvl_t instead. Presumably this mismatch is why the row does not show up.
dataset.read(temp.data(), memType, memspace, dataspace);
for (int i = 0; i < temp.size(); i++) {
std::cout << temp[i] << ", ";
}
std::cout << "\n";
return 0;
}
Nothing crashes when I run the C++ program, and the correct number of rows in the "test" dataset is printed (100), but the dataset.read() step isn't working: the first row isn't being read into the vector I want it to be read into (temp). I would greatly appreciate if someone could let me know what I'm doing wrong. Thanks so much.
My goal is to eventually read all 100 rows in the dataset in a loop (placing each row of data into the std::vector temp) and store each one in the std::vector<std::vector<double>> called data. But for now I'm just trying to make sure I can even read the first row.
EDIT: link to hdf5 file
"test" dataset looks like this:
[ 0.16371168 -0.21425339 0.29859526 -0.82794418 0.01021543 1.05546644
-0.546841 1.17456768 0.66068215 -1.04944273 1.48596426 -0.62527598
-2.55912244 -0.82908105 -0.53978052 -0.88870719]
[ 0.33958656 -0.48258915 2.10885699 -0.12130623 -0.2873894 -0.37100313
-1.05934898 -2.3014427 1.45502412 -0.06152739 0.92532768 1.35432642
1.51560926 -0.24327452 1.00886476 0.19749707 0.43894484 0.4394992
-0.12814881]
[ 0.64574273 0.14938582 -0.10369248 1.53727461 0.62404949 1.07824824
1.17066933 1.17196281 -2.05005927 0.13639514 -1.45473056 -1.71462623
-1.11552074 -1.73985207 1.12422121 -1.58694009]
...
EDIT 2:
I've additionally tried without any luck to read the data into (array, armadillo vector, eigen vectorXd). The program does not crash, but what is read into the containers is garbage:
#include <H5Cpp.h>
#include <string>
#include <vector>
#include <stdio.h>
#include <Eigen/Dense>
#include <Eigen/Core>
#include <armadillo>
// Second attempt: reads row 0 of the variable-length dataset "test" three
// times, into a plain array, an Armadillo vector and an Eigen vector, and
// prints each container.
// NOTE(review): every read passes a buffer of doubles together with a
// variable-length memory type; the working version later in this document
// reads into an hvl_t instead. Presumably that is why the output is garbage.
int main(int argc, char **argv) {
std::string filename = argv[1];
// memtype of the file
auto itemType = H5::PredType::NATIVE_DOUBLE;
auto memType = H5::VarLenType(&itemType);
// get dataspace
H5::H5File file(filename, H5F_ACC_RDONLY);
H5::DataSet dataset = file.openDataSet("test");
H5::DataSpace dataspace = dataset.getSpace();
// get the size of the dataset
hsize_t rank;
hsize_t dims[1];
rank = dataspace.getSimpleExtentDims(dims); // rank = 1
std::cout << "Data size: "<< dims[0] << std::endl; // this is the correct number of values
std::cout << "Data rank: "<< rank << std::endl; // this is the correct rank
// create memspace
hsize_t memDims[1] = {1};
H5::DataSpace memspace(rank, memDims);
// Select hyperslabs
hsize_t dataCount[1] = {1};
hsize_t dataOffset[1] = {0}; // this would be i if reading in a loop
hsize_t memCount[1] = {1};
hsize_t memOffset[1] = {0};
dataspace.selectHyperslab(H5S_SELECT_SET, dataCount, dataOffset);
memspace.selectHyperslab(H5S_SELECT_SET, memCount, memOffset);
// Create storage to hold read data
int i;
int NX = 20;
// NX is not a compile-time constant, so this array relies on a compiler
// extension (variable-length arrays are not standard C++).
double data_out[NX];
for (i = 0; i < NX; i++)
data_out[i] = 0;
arma::vec temp(20);
Eigen::VectorXd temp2(20);
// Read data into data_out (array)
dataset.read(data_out, memType, memspace, dataspace);
std::cout << "data_out: " << "\n";
for (i = 0; i < NX; i++)
std::cout << data_out[i] << " ";
std::cout << std::endl;
// Read data into temp (arma vec)
dataset.read(temp.memptr(), memType, memspace, dataspace);
std::cout << "arma vec: " << "\n";
std::cout << temp << std::endl;
// Read data into temp2 (eigen vec)
dataset.read(temp2.data(), memType, memspace, dataspace);
std::cout << "eigen vec: " << "\n";
std::cout << temp2 << std::endl;
return 0;
}
(ONE) SOLUTION:
After struggling with this a lot, I was able to get a solution working, though admittedly I'm too new to C++ to really understand why this works and why the previous attempts didn't:
#include <H5Cpp.h>
#include <iostream>
#include <string>
#include <vector>
#include <stdio.h>
int main(int argc, char **argv) {
std::string filename = argv[1];
// Set memtype of the file
auto itemType = H5::PredType::NATIVE_DOUBLE;
auto memType = H5::VarLenType(&itemType);
// Get dataspace
H5::H5File file(filename, H5F_ACC_RDONLY);
H5::DataSet dataset = file.openDataSet("test");
H5::DataSpace dataspace = dataset.getSpace();
// Get the size of the dataset
hsize_t rank;
hsize_t dims[1];
rank = dataspace.getSimpleExtentDims(dims); // rank = 1
std::cout << "Data size: "<< dims[0] << std::endl; // this is the correct number of values
std::cout << "Data rank: "<< rank << std::endl; // this is the correct rank
// Create memspace
hsize_t memDims[1] = {1};
H5::DataSpace memspace(rank, memDims);
// Initialize hyperslabs
hsize_t dataCount[1];
hsize_t dataOffset[1];
hsize_t memCount[1];
hsize_t memOffset[1];
// Create storage to hold read data
hvl_t *rdata = new hvl_t[1];
std::vector<std::vector<double>> dataOut;
for (hsize_t i = 0; i < dims[0]; i++) {
// Select hyperslabs
dataCount[0] = 1;
dataOffset[0] = i;
memCount[0] = 1;
memOffset[0] = 0;
dataspace.selectHyperslab(H5S_SELECT_SET, dataCount, dataOffset);
memspace.selectHyperslab(H5S_SELECT_SET, memCount, memOffset);
// Read out the data
dataset.read(rdata, memType, memspace, dataspace);
double* ptr = (double*)rdata[0].p;
std::vector<double> thisRow;
for (int j = 0; j < rdata[0].len; j++) {
double* val = (double*)&ptr[j];
thisRow.push_back(*val);
}
dataOut.push_back(thisRow);
}
// Confirm data read out properly
for (int i = 0; i < dataOut.size(); i++) {
std::cout << "Row " << i << ":\n";
for (int j = 0; j < dataOut[i].size(); j++) {
std::cout << dataOut[i][j] << " ";
}
std::cout << "\n";
}
return 0;
}
If anyone knows a more efficient way that doesn't involve looping over the elements of each row (i.e. pull out an entire row in one go) that would be really helpful, but for now this works fine for me.

Related

LLDB customize print of template class

I use LLDB as my debugger, and want it to print my template class MyArray<N> in a customized format.
I read the LLDB document, and come up with python script that can get public and private data members of MyArray<N>. However, I don't know how to get N (the template parameter), neither do I know how to get result return by MyArray<N>::size().
Here is the code
#include <stdio.h>
#include <iostream>
/**
 * Fixed-size, heap-backed array of N ints.
 *
 * The object owns its buffer: construction allocates N zero-initialised
 * elements and destruction frees them. Copying performs a deep copy so two
 * objects never share (and double-delete) the same buffer.
 */
template<int N>
class MyArray
{
public:
    MyArray() : data(new int[N]()) {}
    MyArray(const MyArray& other) : data(new int[N]) { copyFrom(other); }
    MyArray& operator=(const MyArray& other)
    {
        // Same size on both sides, so the existing buffer can be reused.
        if (this != &other) copyFrom(other);
        return *this;
    }
    ~MyArray() { delete[] data; }   // delete[] on nullptr is a no-op

    /// Number of elements (the template argument N).
    int size() const { return N; }
    /// Unchecked element access.
    int& operator[](size_t i) { return data[i]; }
    int const& operator[](size_t i) const { return data[i]; }
private:
    void copyFrom(const MyArray& other)
    {
        for (int i = 0; i < N; ++i) data[i] = other.data[i];
    }

    int* data = nullptr;
};
// Streams a MyArray as "N = <size>", a heading line, and then its elements
// separated by ", ", each section terminated with std::endl.
template<int N>
std::ostream& operator <<(std::ostream& os, const MyArray<N>& arr)
{
    const int count = arr.size();
    os << "N = " << count << std::endl;
    os << "elements in array:" << std::endl;
    int idx = 0;
    while (idx < count) {
        if (idx != 0) {
            os << ", ";
        }
        os << arr[idx];
        ++idx;
    }
    os << std::endl;
    return os;
}
// Fills a MyArray<10> with 10, 11, ..., 19 and prints it through operator<<.
int main()
{
    MyArray<10> arr;
    const int count = arr.size();
    for (int idx = 0; idx != count; ++idx) {
        arr[idx] = idx + 10;
    }
    std::cout << arr << std::endl; // Yeah, I can use this for print. but I want this during LLDB debug
    return 0;
}
//// Update: Add corresponding lldb config
~/.lldbinit:
command script import ~/.lldbcfg/print_my_array.py
~/.lldbcfg/print_my_array.py:
def print_my_array(valobj, internal_dict):
    """LLDB summary provider for ``MyArray<N>``.

    Args:
        valobj: the value being summarised (an ``lldb.SBValue``).
        internal_dict: LLDB's per-session script dictionary (unused).

    Returns:
        A string of the form ``N = <n>, (<e0>, <e1>, ...)``, or a short
        diagnostic if N cannot be recovered from the type name.
    """
    import re
    import lldb

    # N is a non-type template argument, not a data member, so it cannot be
    # fetched with GetChildMemberWithName(); recover it from the type name.
    type_name = valobj.GetType().GetCanonicalType().GetName() or ''
    match = re.search(r'MyArray<\s*(-?\d+)\s*>', type_name)
    if match is None:
        return '<cannot determine N from type %r>' % type_name
    n = int(match.group(1))

    data = valobj.GetChildMemberWithName("data")
    # `data` is an int*, so children beyond index 0 must be created
    # synthetically (third argument True) to index it like an array.
    elements = ', '.join(
        str(data.GetChildAtIndex(i, lldb.eNoDynamicValues, True).GetValueAsSigned(0))
        for i in range(n))
    return 'N = %d, (%s)' % (n, elements)


def __lldb_init_module(debugger, internal_dict):
    """Registers print_my_array as the summary for every MyArray<N>."""
    # -x treats the type name as a regular expression so that any N matches.
    debugger.HandleCommand(
        'type summary add -x "^MyArray<.+>$" -F ' + __name__ + '.print_my_array')
The simple way would be to store the value of N as static member:
// Variant of MyArray that republishes its template argument as the
// compile-time constant member `n`.
template<int N>
class MyArray
{
public:
    static constexpr int n{N};
};
Supposed MyArray is not your type you can infer the template argument via a trait:
// Trait that extracts the integer template argument of a MyArray<N> without
// modifying MyArray itself. The primary template is left undefined so that
// using the trait with any other type is a compile-time error.
template <typename T>
struct get_value;

// Specialisation for MyArray<N>: publishes N as the constant `n`.
template <int N>
struct get_value<MyArray<N>> {
    static constexpr int n = N;   // the original omitted the type `int`
};

Reading certain letters after a specified string from a text file [closed]

Closed. This question needs to be more focused. It is not currently accepting answers.
Want to improve this question? Update the question so it focuses on one problem only by editing this post.
Closed 11 months ago.
Improve this question
I want to get out the characters and numbers immediately after the very specific characters "data-permalink=" in a huge text file (50MB). The output should ideally be written in a simple (separate) text file looking something like this:
34k89
456ij
233a4
...
the "data-permalink="" stays always the exact same (as usual in source codes), but the id within can be any combination of characters and numbers. It seemed simple at first, but since it is not at the start of a line, or the needed output is not a separate word I was not able to come up with a working solution at all in the required time. I am running out of time and need a solution or hints to this immediately, so any help is greatly appreciated
example of data in the source data file:
random stuff above
....
I would understand c++ or python the most, so such a solution using these languages would be nice.
I tried something like this:
#include <iostream>
#include <string>
#include <fstream>
using namespace std;
// First attempt at scanning data.txt for the marker text "data-permalink=".
// NOTE: as posted this does not compile; see the notes inside the loop.
int main()
{
ifstream in ("data.txt");
if(in.fail())
{
cout<<"error";
}
else
{
char c;
// Reads the file one character at a time.
while(in.get(c))
{
// NOTE: c is a single char, so it cannot be compared with a string literal.
if(c=="data-permalink=")
// NOTE: the statement below is missing its terminating ';'.
cout<<"lol this is awesome"
else
cout<<" ";
}
}
return 0;
}
It is just a random attempt to see if the structure works, nowhere near a solution. This prob. also gives u guys a good guess on how bad i am currently lmao.
Hm, basically 50MB is considered "small" nowadays. With that small an amount of data, you can read the whole file into one std::string and then do a linear search.
So, the algorithm is:
Open files and check, if they could be opened
Read complete file into a std::string
Do a linear search for the string "data-permalink=""
Remember the start position of the permalink
Search for the closing "
Use the std::string's substr function to create the output permalink string
Write this to a file
Goto 1.
I created a 70MB random test file with random data.
The whole procedure takes less than 1s. Even with slow linear search.
But caveat. You want to parse a HTML file. This will most probably not work, because of potential nested structures. For this you should use existing HTML parsers.
Anyway. Here is one of many possible solutions.
#include <iostream>
#include <fstream>
#include <string>
#include <random>
#include <iterator>
#include <algorithm>
std::string randomSourceCharacters{ " abcdefghijklmnopqrstuvwxyz" };
const std::string sourceFileName{ "r:\\test.txt" };
const std::string linkFileName{ "r:\\links.txt" };
// Writes 1,000,000 records of random filler text to sourceFileName. Every
// record is: a random prefix, the text data-permalink=" , a random link, a
// closing quote, and a random suffix; each part is 10 to 30 characters drawn
// from randomSourceCharacters. Reports to std::cerr if the file cannot be
// opened.
void createRandomData() {
    std::random_device seedSource;
    std::mt19937 engine(seedSource());
    std::uniform_int_distribution<> pickCharacter(0, randomSourceCharacters.size() - 1);
    std::uniform_int_distribution<> pickLength(10, 30);

    std::ofstream out{ sourceFileName };
    if (!out) {
        std::cerr << "\nError: Could not open source file '" << sourceFileName << "' for writing\n";
        return;
    }

    // Emits `count` random characters from the source alphabet.
    const auto emitRandomRun = [&](int count) {
        while (count-- > 0) {
            out << randomSourceCharacters[pickCharacter(engine)];
        }
    };

    for (size_t record = 0; record != 1000000; ++record) {
        // All three lengths are drawn before any character is generated.
        const int prefixLength{ pickLength(engine) };
        const int linkLength{ pickLength(engine) };
        const int suffixLength{ pickLength(engine) };

        emitRandomRun(prefixLength);
        out << "data-permalink=\"";
        emitRandomRun(linkLength);
        out << "\"";
        emitRandomRun(suffixLength);
    }
}
/**
 * Extracts every value that follows data-permalink=" (up to the next
 * double quote) from sourceFileName and writes one value per line to
 * linkFileName.
 *
 * @return 0 on success, 1 if either file cannot be opened
 */
int main() {
    // Please uncomment if you want to create a file with test data
    // createRandomData();

    // Open source file for reading and check, if file could be opened
    std::ifstream ifs{ sourceFileName };
    if (!ifs) {
        std::cerr << "\nError: Could not open source file '" << sourceFileName << "' for reading\n";
        return 1;
    }

    // Open link file for writing and check, if file could be opened
    std::ofstream ofs{ linkFileName };
    if (!ofs) {
        // The original reported the source file here, which hid the real cause.
        std::cerr << "\nError: Could not open link file '" << linkFileName << "' for writing\n";
        return 1;
    }

    // Read the complete 50MB file into a string
    const std::string data(std::istreambuf_iterator<char>(ifs), {});
    const std::string searchString{ "data-permalink=\"" };
    const char permalinkEnd{ '"' };

    // Do a linear search
    size_t pos{};
    while ((pos = data.find(searchString, pos)) != std::string::npos) {
        const size_t linkBegin = pos + searchString.length();

        // Search for the end of the permalink; an unterminated one ends the scan
        const size_t linkEnd = data.find(permalinkEnd, linkBegin);
        if (linkEnd == std::string::npos) break;

        // Output result straight from the buffer, without a temporary string
        ofs.write(data.data() + linkBegin, static_cast<std::streamsize>(linkEnd - linkBegin));
        ofs << '\n';

        pos = linkEnd + 1;
    }
    return 0;
}
Edit
If you need unique links you may store the result in an std::unordered_set and then output later.
#include <iostream>
#include <fstream>
#include <string>
#include <iterator>
#include <algorithm>
#include <unordered_set>
const std::string sourceFileName{ "r:\\test.txt" };
const std::string linkFileName{ "r:\\links.txt" };
/**
 * Extracts every distinct value that follows data-permalink=" (up to the
 * next double quote) from sourceFileName and writes one value per line to
 * linkFileName. The output order is unspecified because the values are
 * collected in an unordered set.
 *
 * @return 0 on success, 1 if either file cannot be opened
 */
int main() {
    // Open source file for reading and check, if file could be opened
    std::ifstream ifs{ sourceFileName };
    if (!ifs) {
        std::cerr << "\nError: Could not open source file '" << sourceFileName << "' for reading\n";
        return 1;
    }

    // Open link file for writing and check, if file could be opened
    std::ofstream ofs{ linkFileName };
    if (!ofs) {
        // The original reported the source file here, which hid the real cause.
        std::cerr << "\nError: Could not open link file '" << linkFileName << "' for writing\n";
        return 1;
    }

    // Read the complete 50MB file into a string
    const std::string data(std::istreambuf_iterator<char>(ifs), {});
    const std::string searchString{ "data-permalink=\"" };
    const char permalinkEnd{ '"' };

    // Here we will store unique results
    std::unordered_set<std::string> result{};

    // Do a linear search
    size_t pos{};
    while ((pos = data.find(searchString, pos)) != std::string::npos) {
        const size_t linkBegin = pos + searchString.length();

        // Search for the end of the permalink; an unterminated one ends the scan
        const size_t linkEnd = data.find(permalinkEnd, linkBegin);
        if (linkEnd == std::string::npos) break;

        // Construct the substring directly inside the set
        result.emplace(data, linkBegin, linkEnd - linkBegin);

        pos = linkEnd + 1;
    }

    for (const std::string& link : result)
        ofs << link << '\n';
    return 0;
}

Vector Values (String and Int) Summation C++

I have moved to C++ from Python, and just wanted to know the way to sum the values that a vector (list) object holds. For example, in python, I could use the code below:
# Sum the numbers in a list with an explicit loop.
totalSum = 0
myList = [1,2,3,4,5]
for i in myList:
    # The loop body must be indented; the flattened original was not valid Python.
    totalSum += i
print(totalSum)
// Output:
15
However, I want to learn the way to do this in C++
# Concatenate the strings in a list, appending a space after each one.
totalSum = ""
myList = ["Hello", "World!"]
for i in myList:
    # Both appends belong to the loop body (indentation restored).
    totalSum += i
    totalSum += " "
print(totalSum)
//Output:
Hello World!
And this one is for the string combination.
Could you please provide how to do this in c++?
I have tried the code below in C++ to test, however, it does not compile successfully:
#include <iostream>
#include <vector>
using namespace std;
// Asker's attempt at summing the elements of a vector.
int main()
{
// A random list/vector here:
vector <double> v = {1, 2, 3, 4, 5};
// Accumulator for the total.
// NOTE: sum is never initialised, so its starting value is indeterminate;
// it is also an int although the elements being added are doubles.
int sum;
// Iterating through the vector
// NOTE: i is a signed int while v.size() is unsigned, so compilers may warn
// about the comparison.
for (int i = 0; i < v.size(); i++) {
sum += v[i];
}
cout << sum;
return 0;
}
Your code works fine except for the fact you haven't initialized sum variable.
Here is some self-explanatory code discussing what you can use (based on the comments on your question):
#include <iostream>
#include <numeric>
#include <string>
#include <vector>
// Demonstrates four equivalent ways of combining the elements of a vector:
// first concatenating strings (each followed by a space), then summing ints.
// Every method prints its result on its own line.
int main() {
// For strings:
std::string str;
std::vector<std::string> v = {"Hello", "World!"};
// Method 1: Using range-based for loop:
for (auto &&i : v) {
str += i;
str += " ";
}
std::cout << str << std::endl;
// Method 2: Using std::accumulate():
// The lambda receives the running result `a` and the next element `b`.
str = std::accumulate(v.begin(), v.end(), std::string(), [](std::string a, std::string b) {
return std::move(a) + b + " ";
});
std::cout << str << std::endl;
// Method 3: The usual for-loop:
str = "";
for (size_t i = 0; i < v.size(); ++i) {
str += v.at(i); // str += v[i];
str += " ";
}
std::cout << str << std::endl;
// Method 4: Using iterators:
str = "";
for (auto i = v.begin(); i < v.end(); ++i) { // for (auto i = std::begin(v); i < std::end(v); std::advance(i, 1))
str += *i;
str += " ";
}
std::cout << str << std::endl;
// For numbers:
std::vector<int> v2 = {1, 2, 3, 4, 5};
int sum = 0;
// Method 1: Using range-based for loop:
for (auto &&i : v2)
sum += i;
std::cout << sum << std::endl;
// Method 2: Using std::accumulate():
// The initial value 0 is an int, so the accumulation is done in int.
sum = std::accumulate(v2.begin(), v2.end(), 0);
std::cout << sum << std::endl;
// Method 3: The usual for-loop:
sum = 0;
for (size_t i = 0; i < v2.size(); ++i)
sum += v2.at(i); // sum += v2[i]
std::cout << sum << std::endl;
// Method 4: Using iterators:
sum = 0;
for (auto i = v2.begin(); i < v2.end(); ++i) // for (auto i = std::begin(v2); i < std::end(v2); std::advance(i, 1))
sum += *i;
std::cout << sum << std::endl;
return 0;
}
You can replace the argument list of the lambda passed to std::accumulate with (auto a, auto b) instead of (std::string a, std::string b) if you are using C++14 or above.
You need to include <iterator> if you are using std::begin() or std::end() or std::advance(). Also you can remove <numeric> if you are not using std::accumulate().
For documentation of any unfamiliar thing you see in my code, kindly visit https://en.cppreference.com/.
The program has an error. In the for loop, you're trying to compare an integer to an unsigned long long int which is returned by v.size() (use -Wall mode in the compiler arguments to get it).
Using for each syntax, an approach is defined as follows:
#include <iostream>
#include <vector>
// Concatenates the words of a vector, appending a space after each word,
// and prints the result.
int main(void) {
    std::vector <std::string> v = {"Hello", "World"};
    std::string sum;
    // Bind by const reference: `auto i` would copy every string.
    for (const auto& word : v) {
        sum += word;
        sum += ' ';
    }
    std::cout << sum << std::endl;
    return 0;
}
This will print:
Hello World
If you're interested in the STL algorithms, you can achieve this by using std::accumulate:
#include <vector>
#include <numeric>
#include <string>
#include <iostream>
// Shows std::accumulate twice: summing doubles, and concatenating strings
// with a space appended after each element.
int main()
{
    std::vector <double> v = {1, 2, 3, 4, 5};
    // 0.0 (not 0) keeps the accumulation in double.
    std::cout << std::accumulate(v.begin(), v.end(), 0.0) << "\n";

    std::vector<std::string> s = {"Hello", "World", "abc", "123"};
    // The accumulator is taken by value: since C++20 std::accumulate passes
    // it as std::move(acc), which cannot bind to the non-const `auto&`
    // parameter the original lambda declared.
    std::cout << std::accumulate(s.begin(), s.end(), std::string(),
        [](std::string total, const std::string& str) {
            total += str;
            total += " ";
            return total;
        });
}
Output:
15
Hello World abc 123

Can't extract actual data from .dat files in C++?

I'm trying to extract data from .dat (data in file is in 16 bit) file in c++ which is showing garbage data. I'm able to extract it in python (code provided below as well) but my work requires it to be in C++. Here is the C code that I'm using.
Also I would like to know what is the fastest way to extract data since my file are a bit large in size.
#include<iostream>
#define N 4000
using namespace std;
// One fixed-size chunk of the input file: 1500 raw bytes.
struct record {
char details[1500];
};
// Asker's attempt: reads up to N records from mirror.dat and prints each one.
int main(int argc, char** argv) {
FILE *fp = fopen("mirror.dat","rb");
record *records;
if (fp==NULL){
cout<<"Problem \n";
system("pause");
return -1;
}
// NOTE: this array is never released with delete[].
records = new record[N];
// NOTE: the return value (number of records actually read) is ignored.
fread((record *)records, sizeof(record),N,fp );
fclose(fp);
for(int i=0; i<N;i++){
// details is a char array, so << prints it as a C-style string rather than
// as decoded 16-bit values.
cout<<"[" << i+1 << "]" << records[i].details << "\n";
}
system("PAUSE");
return 0;
}
Below is the python code.
# Read mirror.dat and decode it as a sequence of little-endian 16-bit values.
fpath="mirror.dat"
with open(fpath, 'rb') as r_file:
    data=r_file.read()
# Combine each byte pair (low byte first). Stopping at len(data)-1 ignores a
# trailing unpaired byte instead of raising IndexError on data[i+1].
bits=[data[i+1]<<8 | data[i] for i in range(0, len(data)-1,2)]
print(type(bits))
bits_decod = []
for k in bits:
    bits_decod.append(k)
print((bits_decod))
In C++, when you print a char array using <<, it expects it to be a C-style character string.
You need to write a loop that decodes it similarly to the way the Python script does.
#include<iostream>
#define N 4000
using namespace std;
uint8_t data[N * 1500];
uint16_t bits[N * 750];
/**
 * Reads mirror.dat into the global byte buffer `data`, decodes it as
 * little-endian 16-bit values into the global `bits`, and prints every value
 * with a 1-based index.
 *
 * @return 0 on success, 1 if the file cannot be opened or nothing was read
 */
int main(int argc, char** argv) {
    FILE *fp = fopen("mirror.dat","rb");
    if (fp==NULL){
        cout<<"Problem \n";
        system("pause");
        return 1;
    }
    // Element size 1 makes fread return the number of BYTES read; with
    // (sizeof(data), 1) it would only ever return 0 or 1.
    const size_t data_len = fread((void *)data, 1, sizeof(data), fp);
    fclose(fp);
    // size_t is unsigned, so a "< 0" test can never fire; zero bytes is the
    // failure signal available here.
    if (data_len == 0) {
        cout << "Read error\n";
        system("pause");
        return 1;
    }
    // Integer division drops a trailing unpaired byte, so data[2*i+1] below
    // always lies within the bytes that were actually read.
    const size_t bits_len = data_len / 2;
    for (size_t i = 0; i < bits_len; i++) {
        bits[i] = data[2*i+1] << 8 | data[2*i];
    }
    for(size_t i=0; i<bits_len;i++){
        cout<<"[" << i+1 << "]" << bits[i] << "\n";
    }
    system("PAUSE");
    return 0;
}
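In C++ you can read the contents of a file into a std::vector of uint8_t with the use of std::istream_iterator. Then loop through the vector, decoding the bytes and putting them into a vector of uint16_t. Because std::istream_iterator reads through the formatted operator>>, whitespace skipping must be switched off on the stream first (for example testFile.unsetf(std::ios::skipws)); otherwise every byte that happens to look like whitespace is silently dropped.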
std::istream_iterator<uint8_t>(testFile) is an iterator to beginning of file and std::istream_iterator<uint8_t>() is default-constructed with the special state "end-of-stream". So using this iterator can be used to read from the beginning of the file to the end. We don't have to calculate the size ourselves, and therefore can be used to read varying number of entries in the file.
The equivalent C++ program will look something like this:
#include <iostream>
#include <cstddef>
#include <vector>
#include <iterator>
#include <algorithm>
#include <fstream>
#include <cstdint>
/**
 * Reads mirror.dat as raw bytes, decodes consecutive byte pairs as
 * little-endian 16-bit values and prints them separated by single spaces.
 *
 * @return 0 on success, 1 if the file cannot be opened
 */
int main()
{
    //Open file
    std::ifstream testFile("mirror.dat", std::ios::in | std::ios::binary);
    if (!testFile)
    {
        std::cout << "Problem \n";
        system("pause");
        return 1;
    }

    // istream_iterator extracts with operator>>, which skips whitespace by
    // default; binary mode does not change that. Without this line every
    // byte equal to a whitespace character (space, tab, newline, ...) would
    // be dropped.
    testFile.unsetf(std::ios::skipws);

    //Read in file contents
    std::vector<uint8_t> data((std::istream_iterator<uint8_t>(testFile)), std::istream_iterator<uint8_t>());
    std::vector<uint16_t> bytes_decoded;
    bytes_decoded.reserve(data.size() / 2);

    //Decode bytes; "i + 1 < size" keeps data[i + 1] in range for odd sizes
    for (std::size_t i = 0; i + 1 < data.size(); i += 2)
    {
        bytes_decoded.push_back(data[i + 1] << 8 | data[i]);
    }

    //Copy decoded bytes to screen with one space between each number
    //(the delimiter is an argument of ostream_iterator, not of std::copy)
    std::copy(bytes_decoded.cbegin(), bytes_decoded.cend(), std::ostream_iterator<uint16_t>(std::cout, " "));
    system("PAUSE");
    return 0;
}
Note: This requires C++11 or above for the types uint8_t and uint16_t in the header cstdint. You could use unsigned char and unsigned short instead if you don't have a modern C++ compiler.

Access a Numpy Recarray via the C-API

If we have a Numpy recarray:
x = np.array([(1.,2.)], dtype=np.dtype([('a','<f8'),('b','<f8')]))
We can access its fields in Python as:
x['a'] or x['b']
But if this array is passed to a C program as a PyArrayObject how do we access its fields? I realize we can get the dtype in C via: PyArray_Descr *dtype = PyArray_DTYPE(arr)PyObject *fields = dtype->fields but how can this be used to access the data at x['a']?
I'll try to answer my own question.
It appears that you can use the function PyObject_GetItem() to access fields in your Numpy recarray. To test this I created a simple recarray with three fields:
np.dtype([('field1', '<f8', (1,2)), ('field2', '<f8', (2,2)), ('field3', '<f8', (3,1))]) I send this array to my C++ function and exectute two loops: one loop over each field and a nested loop over the array elements in each field (eg. x['field1'], x['field2'], x['field3']). In the outerloop I use PyObject_GetItem() to access each field. The code is as follows:
C++ Code
#include "Python.h"
#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION
#include "arrayobject.h"
#include <cmath>
#include <iostream>
#include <iomanip>
using namespace std;
/**
 * readarray(arr): prints every field of a NumPy structured array.
 *
 * For each field name in the array's dtype, the field is fetched with
 * PyObject_GetItem (the C equivalent of arr[name]) and walked with an
 * NpyIter; each element is printed next to its iteration index, with the
 * field name leading the first line of the field.
 *
 * NOTE(review): elements are read through a double pointer, so this assumes
 * every field holds float64 data - confirm before using with other dtypes.
 *
 * @param self unused
 * @param args argument tuple holding exactly one ndarray
 * @return a new reference to the Python int 0, or NULL with an exception set
 */
static PyObject *readarray(PyObject *self, PyObject *args) {
    PyArrayObject *arr;
    if (!PyArg_ParseTuple(args, "O!", &PyArray_Type, &arr)) {
        return NULL;
    }

    PyObject *names = PyArray_DTYPE(arr)->names;   // borrowed; NULL when there are no fields
    if (names != NULL) {
        // New reference; released on every exit path below.
        PyObject *fast = PySequence_Fast(names, "dtype.names is not a sequence");
        if (fast == NULL) {
            return NULL;
        }
        const Py_ssize_t N = PySequence_Fast_GET_SIZE(fast);
        for (Py_ssize_t i = 0; i < N; i++) {
            PyObject *name = PySequence_Fast_GET_ITEM(fast, i);   // borrowed
            const char *label = PyString_AsString(name);
            if (label == NULL) {
                Py_DECREF(fast);
                return NULL;
            }
            cout << setw(7) << left << label;

            // New reference to the field view; the original never released it.
            PyObject *field = PyObject_GetItem((PyObject *) arr, name);
            if (field == NULL) {
                Py_DECREF(fast);
                return NULL;
            }
            PyArrayObject *x2 = (PyArrayObject *) field;

            NpyIter *iter = NpyIter_New(x2, NPY_ITER_READONLY, NPY_KEEPORDER,
                                        NPY_SAME_KIND_CASTING, PyArray_DTYPE(x2));
            if (iter == NULL) {
                Py_DECREF(field);
                Py_DECREF(fast);
                return NULL;
            }
            NpyIter_IterNextFunc *iternext = NpyIter_GetIterNext(iter, NULL);
            if (iternext == NULL) {
                NpyIter_Deallocate(iter);
                Py_DECREF(field);
                Py_DECREF(fast);
                return NULL;
            }
            double **dataptr = (double **) NpyIter_GetDataPtrArray(iter);

            do {
                const npy_intp index = NpyIter_GetIterIndex(iter);
                if (index != 0) {
                    // Lines after the first are prefixed with this filler
                    // instead of the field name.
                    cout << " ";
                }
                cout << setw(6) << right << index << setw(9) << setiosflags(ios::fixed) << setprecision(4) << **dataptr << endl;
            } while (iternext(iter));

            // Release per-field resources inside the loop; the original
            // deallocated only the last iterator, after the loop had ended.
            NpyIter_Deallocate(iter);
            Py_DECREF(field);
        }
        Py_DECREF(fast);
    }
    return Py_BuildValue("i", 0);
}
// Method table of the extension module: exposes readarray() to Python under
// the name "readarray", called with a positional-argument tuple
// (METH_VARARGS). The all-NULL entry terminates the table.
static PyMethodDef pyproj4methods[] = {
{"readarray", readarray, METH_VARARGS, "Documentation"},
{NULL, NULL, 0, NULL}
};
// Module initialisation function: registers the module "pyproj4" with the
// method table above, then calls import_array() to set up the NumPy C-API.
PyMODINIT_FUNC initpyproj4(void) {
Py_InitModule("pyproj4", pyproj4methods);
import_array();
}
Python Code
import numpy as np
import pyproj4 as p4
np.random.seed(22)
## Python Implementation ##
# Structured dtype with three float64 sub-array fields of different shapes.
dt = np.dtype([('field1', '<f8', (1,2)), ('field2', '<f8', (2,2)), ('field3', '<f8', (3,1))])
x = np.zeros(2, dtype=dt)
for name in x.dtype.names:
    # Fill the field with random values, then walk it element by element.
    m,n,p = x[name].shape
    x[name] = np.random.randn(m,n,p)
    it = np.nditer(x[name], ['c_index'], ['readonly'])
    for num in it:
        # Only the first line of each field carries the field name.
        # A parenthesised single-argument print works on Python 2 and 3.
        if it.index==0:
            print('{0:6s} {1:6d} {2: 2.4f}'.format(name, it.index, num.item()))
        else:
            print('{0:6s} {1:6d} {2: 2.4f}'.format(' ', it.index, num.item()))
print('-----------------------')
## C-API Implementation ##
p4.readarray(x)
The output in both cases looks like:
field1 0 -0.0919
1 -1.4634
2 1.0818
3 -0.2393
field2 0 -0.4911
1 -1.0023
2 0.9188
3 -1.1036
4 0.6265
5 -0.5615
6 0.0289
7 -0.2308
field3 0 0.5878
1 0.7523
2 -1.0585
3 1.0560
4 0.7478
5 1.0647
If you know a better way to accomplish this, please don't hesitate to post your solution.

Categories