join_datasets

C++ Function Reference

1 Signature

int BigDataStatMeth::join_datasets(T *dsJoined, std::string strsubgroup, Rcpp::StringVector strinput, bool bremoveJoined, bool byCols)

2 Description

Joins multiple HDF5 datasets into a single dataset within the same group.

3 Parameters

  • dsJoined (T *): Pointer to the output dataset where joined data will be stored
  • strsubgroup (std::string): Subgroup path where the datasets are located
  • strinput (Rcpp::StringVector): Vector of input dataset names to join
  • bremoveJoined (bool): Flag to remove original datasets after joining
  • byCols (bool): Flag indicating whether to join by columns

4 Returns

int: returns 0 on success, -1 on failure.

5 Details

Template parameter:

  • T: Dataset type. The function takes a T*, and T* must be either hdf5Dataset* or hdf5DatasetInternal* (enforced by a static_assert).

Parameters:

  • dsJoined: Pointer to the output dataset where joined data will be stored
  • strsubgroup: Subgroup path where the datasets are located
  • strinput: Vector of input dataset names to join
  • bremoveJoined: Flag to remove original datasets after joining
  • byCols: Flag indicating whether to join by columns

Returns:

  • int: Returns 0 on success, -1 on failure

Exceptions (caught inside the function, reported on the error stream and turned into a -1 return value):

  • H5::FileIException on file operation errors
  • H5::DataSetIException on dataset operation errors
  • H5::GroupIException on group operation errors
  • H5::DataSpaceIException on dataspace operation errors
  • H5::DataTypeIException on datatype operation errors
  • std::exception on general errors

Notes:

  • The function uses unlimited datasets to allow for dynamic growth
  • Memory is managed efficiently using Eigen’s mapping capabilities

Performance considerations:

  • Uses block-wise reading and writing for memory efficiency
  • Implements Eigen for fast matrix operations
  • Automatically extends dataset size as needed

6 Call Graph

Function dependencies

7 Source Code

File: inst/include/hdf5Utilities/hdf5Methods.hpp, lines 72-178

/**
 * @brief Appends several HDF5 datasets, one after another, into a single
 *        unlimited output dataset.
 *
 * The output dataset is created with the dimensions of the first input and is
 * then extended along its second dimension by the second dimension of every
 * following input. Each input is read in full and written at the current
 * offset along that second dimension.
 *
 * @param dsJoined      Output dataset handle; T* must be hdf5Dataset* or
 *                      hdf5DatasetInternal* (checked at compile time).
 * @param strsubgroup   Group that holds the input datasets.
 * @param strinput      Names of the input datasets, joined in the given order.
 * @param bremoveJoined When true, remove_elements() is called on the inputs
 *                      once all of them have been written.
 * @param byCols        Not read by this implementation: data is always
 *                      appended along the second dimension.
 *                      NOTE(review): confirm whether a by-rows mode is expected.
 *
 * @return 0 on success, -1 on failure. On failure checkClose_file(dsJoined) is
 *         called and a message is written to Rcpp::Rcerr; no exception leaves
 *         this function.
 *
 * NOTE(review): the first dimension of later inputs is not compared with the
 * first input's; presumably callers guarantee they match - verify.
 */
int join_datasets ( T* dsJoined, std::string strsubgroup, Rcpp::StringVector strinput, bool bremoveJoined, bool byCols )
    {
        static_assert(std::is_same<T*, BigDataStatMeth::hdf5Dataset* >::value || 
                      std::is_same<T*, BigDataStatMeth::hdf5DatasetInternal* >::value,
                      "Error - type not allowed");

        // Without at least one input there is nothing to size the output from;
        // fail the same way the exception handlers below do instead of reading
        // strinput[0] out of range.
        if( strinput.size() == 0 ) {
            checkClose_file(dsJoined);
            Rcpp::Rcerr<<"c++ exception join_datasets: no input datasets to join" << std::endl;
            return -1;
        }

        try{
            
            H5::Exception::dontPrint();
            
            using RowMajorMatrix = Eigen::Matrix<double, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
            
            std::vector<hsize_t> stride = {1, 1},
                                 block = {1, 1},
                                 offset = {0, 0},
                                 count = {0, 0};
            
            // Reads one input dataset completely and writes it at the current
            // offset of the output, then advances the offset.
            auto appendDataset = [&]( const std::string& strname, bool isFirst )
            {
                // Automatic storage: the input handle is destroyed when this
                // scope ends, including when an exception propagates.
                BigDataStatMeth::hdf5Dataset dsInput(dsJoined->getFullPath(), strsubgroup, strname, false);
                dsInput.openDataset();
                
                // Copy the dimensions by value so nothing refers to dsInput's
                // storage once the handle is gone.
                const hsize_t* dims = dsInput.dim();
                const hsize_t nrows = dims[0];
                const hsize_t ncols = dims[1];
                
                if( isFirst ) {
                    // The first input defines the initial shape of the output
                    dsJoined->createUnlimitedDataset( static_cast<unsigned long long>(nrows), static_cast<unsigned long long>(ncols), "real");
                    dsJoined->openDataset();
                } else {
                    // Grow the output along the second dimension only
                    dsJoined->extendUnlimitedDataset( static_cast<unsigned long long>(0), static_cast<unsigned long long>(ncols) );
                }
                
                // Read data to merge
                std::vector<double> buffer( nrows * ncols );
                dsInput.readDatasetBlock( {0, 0}, {nrows, ncols}, stride, block, buffer.data() );
                Eigen::Map<RowMajorMatrix> data( buffer.data(), nrows, ncols );
                
                count[0] = nrows; count[1] = ncols;
                
                // Write data to the new dataset
                dsJoined->writeDatasetBlock( Rcpp::wrap(data), offset, count, stride, block, false);
                
                // Next input starts right after this one
                offset[1] = offset[1] + ncols;
            };
            
            for( R_xlen_t i = 0; i < strinput.size(); ++i ) {
                appendDataset( Rcpp::as<std::string>(strinput[i]), i == 0 );
            }
            
            if( bremoveJoined ) {
                // Remove joined elements
                BigDataStatMeth::remove_elements(dsJoined->getFileptr(), strsubgroup, strinput);    
            }
            
        } catch(const H5::FileIException&) { // catch failure caused by the H5File operations
            checkClose_file(dsJoined);
            Rcpp::Rcerr<<"c++ exception join_datasets (File IException)" << std::endl;
            return -1;
        } catch(const H5::DataSetIException&) { // catch failure caused by the DataSet operations
            checkClose_file(dsJoined);
            Rcpp::Rcerr<<"c++ exception join_datasets (DataSet IException)" << std::endl;
            return -1;
        } catch(const H5::GroupIException&) { // catch failure caused by the Group operations
            checkClose_file(dsJoined);
            Rcpp::Rcerr<<"c++ exception join_datasets (Group IException)" << std::endl;
            return -1;
        } catch(const H5::DataSpaceIException&) { // catch failure caused by the DataSpace operations
            checkClose_file(dsJoined);
            Rcpp::Rcerr<<"c++ exception join_datasets (DataSpace IException)" << std::endl;
            return -1;
        } catch(const H5::DataTypeIException&) { // catch failure caused by the DataType operations
            checkClose_file(dsJoined);
            Rcpp::Rcerr<<"c++ exception join_datasets (Data TypeIException)" << std::endl;
            return -1;
        } catch(const std::exception& ex) {
            checkClose_file(dsJoined);
            Rcpp::Rcerr << "c++ exception join_datasets: " << ex.what() << std::endl;
            return -1;
        } catch (...) {
            checkClose_file(dsJoined);
            Rcpp::Rcerr<<"C++ exception join_datasets (unknown reason)" << std::endl;
            return -1;
        }
        return(0);
    }

8 Usage Example

#include "BigDataStatMeth.hpp"

// Example usage
auto result = join_datasets(...);