RcppSplit_matrix_hdf5

C++ Function Reference

1 Signature

void BigDataStatMeth::RcppSplit_matrix_hdf5(BigDataStatMeth::hdf5Dataset *dstosplit, bool bycols, std::string stroutgroup, std::string stroutdataset, int blocksize, int irows, int icols)

2 Description

Splits an HDF5 dataset into multiple smaller datasets (R interface)

3 Parameters

  • dstosplit (BigDataStatMeth::hdf5Dataset *): Input dataset to split
  • bycols (bool): Whether to split by columns (true) or rows (false)
  • stroutgroup (std::string): Output group path
  • stroutdataset (std::string): Base name for output datasets
  • blocksize (int): Size of each block
  • irows (int): Number of rows in input dataset
  • icols (int): Number of columns in input dataset

4 Details

Parameters:
  • dstosplit: Input dataset to split
  • bycols: Whether to split by columns (true) or rows (false)
  • stroutgroup: Output group path
  • stroutdataset: Base name for output datasets
  • blocksize: Size of each block
  • irows: Number of rows in input dataset
  • icols: Number of columns in input dataset

Implementation approach:
  1. Calculates the number of blocks based on the input dimensions.
  2. Processes the dataset in blocks:
       a. Reads a block from the input dataset.
       b. Creates a new dataset for the block.
       c. Writes the block to the new dataset.
  3. Handles edge cases for the final block, which may be smaller than blocksize.

5 Call Graph

Function dependencies

6 Source Code

File: inst/include/hdf5Utilities/hdf5SplitDataset.hpp, lines 65-140

/**
 * @brief Splits an HDF5 dataset into consecutive blocks and writes each block
 *        to its own dataset.
 *
 * The input is cut along one direction only: columns when @p bycols is true,
 * rows otherwise. Block @c i is written to the dataset named
 * `stroutgroup + "/" + stroutdataset + "." + i` (i starting at 0) in the file
 * returned by `dstosplit->getFileName()`, and is created with type "real".
 * The last block is shortened when the split dimension is not a multiple of
 * @p blocksize. When the split dimension is not positive no dataset is
 * created.
 *
 * @param dstosplit     Input dataset to split (must already be usable for
 *                      `readDatasetBlock`; not checked here).
 * @param bycols        true to split by columns, false to split by rows.
 * @param stroutgroup   Output group path.
 * @param stroutdataset Base name for the output datasets.
 * @param blocksize     Number of rows/columns per block; must be > 0.
 * @param irows         Number of rows in the input dataset.
 * @param icols         Number of columns in the input dataset.
 *
 * Errors are not propagated as C++ exceptions: every failure path calls
 * `checkClose_file(dstosplit, dsOut)` and then reports through `Rf_error`.
 */
inline void RcppSplit_matrix_hdf5 ( BigDataStatMeth::hdf5Dataset* dstosplit, bool bycols, 
                                std::string stroutgroup, std::string stroutdataset, 
                                int blocksize, int irows, int icols )
{
        
    BigDataStatMeth::hdf5Dataset* dsOut = nullptr;
    
    // A zero block size would divide by zero below and a negative one yields
    // meaningless extents, so reject both up front using the same reporting
    // path as the exception handlers.
    if( blocksize <= 0 ) {
        checkClose_file(dstosplit, dsOut);
        Rf_error( "c++ exception RcppSplit_matrix_hdf5 : blocksize must be a positive integer");
    }
        
    try {
        
        const hsize_t step = static_cast<hsize_t>(blocksize);
        
        hsize_t inrows = irows, 
            incols = icols,
            ii = 0,
            kk = 0;
        
        std::vector<hsize_t> stride = {1, 1},
            block = {1, 1};
        
        std::string newDatasetName = "";
        
        // Ceiling division written so that it cannot overflow `int`; a
        // non-positive dimension produces zero blocks.
        const int total = bycols ? icols : irows;
        const int blocks = ( total > 0 ) ? ( (total - 1) / blocksize + 1 ) : 0;
        
        if( bycols ) {
            incols = step;
        } else {
            inrows = step;
        }
        
        for ( int i = 0; i < blocks; i++ )
        {
            newDatasetName = stroutgroup + "/" + stroutdataset + "." + std::to_string(i);
            
            // Widen before multiplying so the offset is computed in hsize_t.
            const hsize_t offset = static_cast<hsize_t>(i) * step;
            
            if( bycols ) { 
                kk = offset;
                // Final block: keep only the remaining columns.
                if( kk + step > static_cast<hsize_t>(icols) ) 
                    { incols = static_cast<hsize_t>(icols) - kk; }
            } else  {
                ii = offset;
                // Final block: keep only the remaining rows.
                if( ii + step > static_cast<hsize_t>(irows) ) 
                    { inrows = static_cast<hsize_t>(irows) - ii; }
            }
            
            std::vector<double> vdts( inrows * incols );
            // Offset and count are passed column-first ({cols, rows}).
            // NOTE(review): presumably the file stores the matrix transposed
            // with respect to R - confirm against hdf5Dataset::readDatasetBlock.
            dstosplit->readDatasetBlock( {kk, ii}, {incols, inrows}, stride, block, vdts.data() );
                
            // NOTE(review): the meaning of the third constructor argument
            // (true) is not visible here - presumably an overwrite flag.
            dsOut = new BigDataStatMeth::hdf5Dataset(dstosplit->getFileName(), newDatasetName, true);
            dsOut->createDataset( inrows, incols, "real"); 
            
            // Only write when the dataset handle was actually created.
            if( dsOut->getDatasetptr() != nullptr ){
                dsOut->writeDataset(vdts.data());
            }
            
            delete dsOut; dsOut = nullptr;
            
        }
        
    } catch( const H5::FileIException& ) {
        checkClose_file(dstosplit, dsOut);
        Rf_error( "c++ exception RcppSplit_matrix_hdf5(File IException )");
    } catch( const H5::DataSetIException& ) { 
        checkClose_file(dstosplit, dsOut);
        Rf_error( "c++ exception RcppSplit_matrix_hdf5 (DataSet IException )");
    } catch( const H5::DataSpaceIException& ) { 
        checkClose_file(dstosplit, dsOut);
        Rf_error( "c++ exception RcppSplit_matrix_hdf5 (DataSpace IException )");
    } catch( const std::exception& ex ) {
        checkClose_file(dstosplit, dsOut);
        Rf_error( "\nC++ exception RcppSplit_matrix_hdf5 : %s", ex.what());
    } catch (...) {
        checkClose_file(dstosplit, dsOut);
        Rf_error( "\nC++ exception RcppSplit_matrix_hdf5 (unknown reason)");
    }
    
}

7 Usage Example

#include "BigDataStatMeth.hpp"

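// Example usage (the function returns void, so there is no result to capture)
BigDataStatMeth::RcppSplit_matrix_hdf5(dstosplit, bycols, stroutgroup, stroutdataset, blocksize, irows, icols);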