RcppSplit_matrix_hdf5_internal

C++ Function Reference

1 Signature

void BigDataStatMeth::RcppSplit_matrix_hdf5_internal(BigDataStatMeth::hdf5Dataset *dstosplit, std::string stroutgroup, std::string stroutdataset, bool bycols, int nblocks, int iblocksize, int irows, int icols)

2 Description

Splits an HDF5 dataset into multiple smaller datasets (internal C++ interface)

3 Parameters

  • dstosplit (BigDataStatMeth::hdf5Dataset *): Input dataset to split
  • stroutgroup (std::string): Output group path
  • stroutdataset (std::string): Base name for output datasets
  • bycols (bool): Whether to split by columns (true) or rows (false)
  • nblocks (int): Number of blocks to create (if > 0)
  • iblocksize (int): Size of each block (if > 0)
  • irows (int): Number of rows in input dataset
  • icols (int): Number of columns in input dataset

4 Details

dstosplitInput dataset to split stroutgroupOutput group path stroutdatasetBase name for output datasets bycolsWhether to split by columns (true) or rows (false) nblocksNumber of blocks to create (if > 0) iblocksizeSize of each block (if > 0) irowsNumber of rows in input dataset icolsNumber of columns in input dataset Implementation features:Flexible block specification:By number of blocks (nblocks > 0)By block size (iblocksize > 0) Automatic block size calculationEfficient memory usageError checking for parameters

5 Call Graph

Function dependencies

6 Source Code

File: inst/include/hdf5Utilities/hdf5SplitDataset.hppLines 196-299

inline void RcppSplit_matrix_hdf5_internal ( BigDataStatMeth::hdf5Dataset* dstosplit,
                                   std::string stroutgroup, std::string stroutdataset,
                                   bool bycols, int nblocks, int iblocksize, 
                                   int irows, int icols )
{
        
    BigDataStatMeth::hdf5Dataset* dsOut = nullptr;
        
    try {
        
        std::string newDatasetName = "";
        hsize_t inrows = irows, 
                incols = icols,
                ii = 0,
                kk = 0;
        int blocks = nblocks, 
            blocksize = iblocksize;;
        std::vector<hsize_t> stride = {1, 1},
                             block = {1, 1};
        
        if( nblocks <= 0 && iblocksize <= 0 ){
            checkClose_file(dstosplit);
            Rf_error( "\n Block size or number of blocks are needed to proceed with matrix split. Please, review parameters");
            return void();
            
        } else if (nblocks > 0 && iblocksize > 0 ) {
            checkClose_file(dstosplit);
            Rf_error( "\nBlock size and number of blocks are defined, please define only one option, split by number of blocks or by block size");
            return void();
            
        } else if( nblocks > 0 ) {
            double module;
            
            if(bycols == true) {
                blocksize = icols / blocks;
                module = icols % blocksize;
            } else {
                blocksize = irows / blocks;
                module = irows % blocksize;
            }
            if (module > 0) { blocksize = blocksize + 1; }
        } 
        
        if(blocksize > 0) {
            
            if( bycols == true ) {
                blocks = (icols + blocksize - 1) / blocksize;
                incols = blocksize;
            } else {
                blocks = (irows + blocksize - 1) / blocksize;
                inrows = blocksize;
            }    
        }
        
        for ( int i=0; i<blocks; i++)
        {
            newDatasetName = stroutgroup + "/" + stroutdataset + "." + std::to_string(i);
            
            if( bycols == true) {
                kk = i * blocksize;
                if( kk + static_cast<hsize_t>(blocksize) > static_cast<hsize_t>(icols)) { 
                    incols = static_cast<hsize_t>(icols) - kk; 
                }
            } else  {
                ii = i * blocksize;
                if( ii + static_cast<hsize_t>(blocksize) > static_cast<hsize_t>(irows)) { 
                    inrows = static_cast<hsize_t>(irows) - ii; 
                }
            }
    
            std::vector<double> vdts( inrows * incols );
            dstosplit->readDatasetBlock( {ii, kk}, {inrows, incols}, stride, block, vdts.data() );
            
            dsOut = new BigDataStatMeth::hdf5Dataset(dstosplit->getFileName(), newDatasetName, true);
            dsOut->createDataset(  incols, inrows, "real"); 
            
            if( dsOut->getDatasetptr() != nullptr ){
                dsOut->writeDataset(vdts.data());
            }
            
            delete dsOut; dsOut = nullptr;
            
        }
        
    } catch( H5::FileIException& error ) {
        checkClose_file(dstosplit, dsOut);
        Rf_error( "c++ exception RcppSplit_matrix_hdf5_internal(File IException )");
    } catch( H5::DataSetIException& error ) { 
        checkClose_file(dstosplit, dsOut);
        Rf_error( "c++ exception RcppSplit_matrix_hdf5_internal (DataSet IException )");
    } catch( H5::DataSpaceIException& error ) { 
        checkClose_file(dstosplit, dsOut);
        Rf_error( "c++ exception RcppSplit_matrix_hdf5_internal (DataSpace IException )");
    } catch(std::exception &ex) {
        checkClose_file(dstosplit, dsOut);
        Rf_error( "\nC++ exception RcppSplit_matrix_hdf5_internal : %s",ex.what());
    } catch (...) {
        checkClose_file(dstosplit, dsOut);
        Rf_error( "\nC++ exception RcppSplit_matrix_hdf5_internal (unknown reason)");
    }
    
    return void();
    
}

7 Usage Example

#include "BigDataStatMeth.hpp"

// Example usage
auto result = RcppSplit_matrix_hdf5_internal(...);