join_datasets
C++ Function Reference
1 Signature
int BigDataStatMeth::join_datasets(T *dsJoined, std::string strsubgroup, Rcpp::StringVector strinput, bool bremoveJoined, bool byCols)
2 Description
Joins multiple HDF5 datasets into a single dataset within the same group.
3 Parameters
- dsJoined (T *): Pointer to the output dataset where joined data will be stored
- strsubgroup (std::string): Subgroup path where the datasets are located
- strinput (Rcpp::StringVector): Vector of input dataset names to join
- bremoveJoined (bool): Flag to remove original datasets after joining
- byCols (bool): Flag indicating whether to join by columns
4 Returns
int: Returns 0 on success, -1 on failure
5 Details
Template parameter:
- T: Dataset type (the pointer type T* must be either hdf5Dataset* or hdf5DatasetInternal*)

Parameters:
- dsJoined: Pointer to the output dataset where joined data will be stored
- strsubgroup: Subgroup path where the datasets are located
- strinput: Vector of input dataset names to join
- bremoveJoined: Flag to remove original datasets after joining
- byCols: Flag indicating whether to join by columns

Returns:
- int: Returns 0 on success, -1 on failure

Exceptions (caught inside the function, which then reports the error and returns -1):
- H5::FileIException on file operation errors
- H5::DataSetIException on dataset operation errors
- H5::GroupIException on group operation errors
- H5::DataSpaceIException on dataspace operation errors
- H5::DataTypeIException on datatype operation errors
- std::exception on general errors

Notes:
- The function uses unlimited datasets to allow for dynamic growth
- Memory is managed efficiently using Eigen’s mapping capabilities

Performance considerations:
- Uses block-wise reading and writing for memory efficiency
- Implements Eigen for fast matrix operations
- Automatically extends dataset size as needed
6 Call Graph
7 Source Code
File: inst/include/hdf5Utilities/hdf5Methods.hpp • Lines 72-178
int join_datasets ( T* dsJoined, std::string strsubgroup, Rcpp::StringVector strinput, bool bremoveJoined, bool byCols )
{
static_assert(std::is_same<T*, BigDataStatMeth::hdf5Dataset* >::value ||
std::is_same<T*, BigDataStatMeth::hdf5DatasetInternal* >::value,
"Error - type not allowed");
try{
H5::Exception::dontPrint();
std::vector<hsize_t> stride = {1, 1},
block = {1, 1},
offset = {0, 0},
count = {0, 0};
std::string stroutdataset = dsJoined->getDatasetName();
BigDataStatMeth::hdf5Dataset* dstoJoin = new hdf5Dataset(dsJoined->getFullPath(), strsubgroup, Rcpp::as<std::string>(strinput[0]), false);
dstoJoin->openDataset();
hsize_t* dims_out = dstoJoin->dim();
// Add rows and needed cols to add the merged data in the new dataset
dsJoined->createUnlimitedDataset( (unsigned long long)dims_out[0], (unsigned long long)dims_out[1], "real");
dsJoined->openDataset();
// Read data to merge
std::vector<double> vreadeddata( dims_out[0] * dims_out[1] );
dstoJoin->readDatasetBlock( {offset[0], offset[1]}, {dims_out[0], dims_out[1]}, stride, block, vreadeddata.data() );
{
Eigen::Map<Eigen::Matrix<double, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>> readedData = Eigen::Map<Eigen::Matrix<double, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>> (vreadeddata.data(), dims_out[0], dims_out[1] );
count[0] = dims_out[0]; count[1] = dims_out[1];
// Write data to the new dataset
dsJoined->writeDatasetBlock( Rcpp::wrap(readedData), offset, count, stride, block, false);
}
delete dstoJoin; // Remove original dataset link
// Update offset to new position
offset[1] = offset[1] + dims_out[1];
for( int i=1; i<strinput.size(); i++)
{
dstoJoin = new hdf5Dataset(dsJoined->getFullPath(), strsubgroup, Rcpp::as<std::string>(strinput[i]), false);
dstoJoin->openDataset();
dims_out = dstoJoin->dim();
// Extend dataset before put data
dsJoined->extendUnlimitedDataset( (unsigned long long)0, (unsigned long long)dims_out[1] );
// Read data to merge
std::vector<double> vreadeddata( dims_out[0] * dims_out[1] );
dstoJoin->readDatasetBlock( {0, 0}, {dims_out[0], dims_out[1]}, stride, block, vreadeddata.data() );
Eigen::Map<Eigen::Matrix<double, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>> readedData = Eigen::Map<Eigen::Matrix<double, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>> (vreadeddata.data(), dims_out[0], dims_out[1] );
delete dstoJoin;
count[0] = dims_out[0]; count[1] = dims_out[1];
// Write data to the new dataset
dsJoined->writeDatasetBlock( Rcpp::wrap(readedData), offset, count, stride, block, false);
// Update offset
offset[1] = offset[1] + dims_out[1];
}
if(bremoveJoined == true) {
// Remove joined elements
BigDataStatMeth::remove_elements(dsJoined->getFileptr(), strsubgroup, strinput);
}
} catch(H5::FileIException& error) { // catch failure caused by the H5File operations
checkClose_file(dsJoined);
Rcpp::Rcerr<<"c++ exception join_datasets (File IException)" << std::endl;
return -1;
} catch(H5::DataSetIException& error) { // catch failure caused by the DataSet operations
checkClose_file(dsJoined);
Rcpp::Rcerr<<"c++ exception join_datasets (DataSet IException)" << std::endl;
return -1;
} catch(H5::GroupIException& error) { // catch failure caused by the Group operations
checkClose_file(dsJoined);
Rcpp::Rcerr<<"c++ exception join_datasets (Group IException)" << std::endl;
return -1;
} catch(H5::DataSpaceIException& error) { // catch failure caused by the DataSpace operations
checkClose_file(dsJoined);
Rcpp::Rcerr<<"c++ exception join_datasets (DataSpace IException)" << std::endl;
return -1;
} catch(H5::DataTypeIException& error) { // catch failure caused by the DataSpace operations
checkClose_file(dsJoined);
Rcpp::Rcerr<<"c++ exception join_datasets (Data TypeIException)" << std::endl;
return -1;
} catch(std::exception &ex) {
checkClose_file(dsJoined);
Rcpp::Rcerr << "c++ exception join_datasets: " << ex.what();
return -1;
} catch (...) {
checkClose_file(dsJoined);
Rcpp::Rcerr<<"C++ exception join_datasets (unknown reason)";
return -1;
}
return(0);
}8 Usage Example
#include "BigDataStatMeth.hpp"
// Example usage. The arguments are elided here: the call takes the output
// dataset pointer, the subgroup path, the vector of input dataset names and
// the bremoveJoined / byCols flags. The result is 0 on success, -1 on failure.
auto result = join_datasets(...);