get_HDF5_mean_sd_by_column

C++ Function Reference

1 Signature

void BigDataStatMeth::get_HDF5_mean_sd_by_column(BigDataStatMeth::hdf5Dataset *dsA, Eigen::MatrixXd &normalize, bool bsd, bool bmean, Rcpp::Nullable< int > wsize)

2 Description

Calculate column-wise mean and standard deviation.

3 Parameters

  • dsA (BigDataStatMeth::hdf5Dataset *): Input matrix dataset
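  • normalize (Eigen::MatrixXd &): Output matrix, pre-allocated by the caller; row 0 receives the means and row 1 receives the standard deviations
  • bsd (bool): Whether to compute the standard deviation (written to row 1 of normalize)
  • bmean (bool): Whether to compute the mean (note: the current implementation always writes the mean and does not consult this flag)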
  • wsize (Rcpp::Nullable< int >): Block size for processing

4 Details

Computes mean and standard deviation for each column of the matrix using block-based processing for memory efficiency. Optimized for cases where n << m (rows much fewer than columns).

5 Call Graph

Function dependencies

6 Source Code

File: inst/include/hdf5Algebra/matrixSdMean.hpp, lines 175-245

/**
 * @brief Compute per-row mean (and optionally standard deviation) of an HDF5
 *        dataset, reading it in blocks of rows.
 *
 * The dataset is traversed along its first stored dimension (`dsA->dim()[0]`)
 * in chunks of `block_size` rows; every chunk spans the full second dimension
 * (`dsA->dim()[1]`), so each statistic is computed from a complete row.
 * NOTE(review): the function name says "by column" — presumably the HDF5
 * layout is transposed with respect to the R matrix, so stored rows are the
 * R columns; confirm against the callers.
 *
 * @param dsA        Input dataset; its `dim()` and `readDatasetBlock()` are used.
 * @param normalize  Caller-allocated output. Row 0, column j receives the mean
 *                   of stored row j; row 1, column j receives its standard
 *                   deviation when @p bsd is true. It must therefore have at
 *                   least `dim()[0]` columns and 1 (or 2 when @p bsd) rows.
 * @param bsd        When true, the sample standard deviation (denominator
 *                   n - 1, n = `dim()[1]`) is written to row 1.
 * @param bmean      Currently not consulted: the mean is always computed and
 *                   written. Kept for interface compatibility.
 * @param wsize      Optional block size hint forwarded to `get_block_size`.
 *
 * On an HDF5 or standard exception the file is closed through
 * `checkClose_file(dsA)` and an R error is raised with `Rf_error`.
 */
inline void get_HDF5_mean_sd_by_column( BigDataStatMeth::hdf5Dataset* dsA,
                                        Eigen::MatrixXd& normalize, 
                                        bool bsd, bool bmean, 
                                        Rcpp::Nullable<int> wsize )
{
    (void)bmean; // intentionally unused, see the @param note above
    
    try
    {
        hsize_t* dims_out = dsA->dim();
        const hsize_t total_rows = dims_out[0];
        const hsize_t total_cols = dims_out[1];
        
        // Unit stride/block: plain contiguous hyperslab selection.
        std::vector<hsize_t> stride = {1, 1},
                             block = {1, 1};
        
        hsize_t block_size = get_block_size(wsize, dims_out[1], dims_out[0]);
        
        // A zero block size would make no progress (and previously divided by
        // zero); fall back to processing the whole dataset as a single block.
        if( block_size == 0 )
            block_size = total_rows;
        
        // Advance on the running offset rather than on a pre-computed
        // iteration count: this never issues an empty trailing read when
        // total_rows is an exact multiple of block_size.
        for( hsize_t row_start = 0; row_start < total_rows; row_start += block_size )
        {
            // The last block may be shorter than block_size.
            const hsize_t remaining = total_rows - row_start;
            const hsize_t rows_in_block = ( remaining < block_size ) ? remaining : block_size;
            
            std::vector<double> vdA( rows_in_block * total_cols ); 
            dsA->readDatasetBlock( {row_start, 0}, {rows_in_block, total_cols}, stride, block, vdA.data() );
            
            // View the buffer as a (rows_in_block x total_cols) row-major matrix without copying.
            Eigen::Map<Eigen::Matrix<double, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>> X (vdA.data(), rows_in_block, total_cols );

            // One mean per stored row of this block -> row 0 of the output.
            Eigen::VectorXd mean = X.rowwise().mean();
            normalize.block( 0, row_start, 1, mean.size()) = mean.transpose();
            
            if(bsd) {
                // Sample standard deviation (n - 1 denominator) -> row 1 of the output.
                Eigen::VectorXd sd = ((X.colwise() - mean).array().square().rowwise().sum() / static_cast<double>(X.cols() - 1)).sqrt();
                normalize.block( 1, row_start, 1, sd.size()) = sd.transpose();
            }
        }
        
    } catch( const H5::FileIException& ) { // failure caused by the H5File operations
        checkClose_file(dsA);
        Rf_error("c++ exception get_HDF5_mean_sd_by_column (File IException)");
    } catch( const H5::DataSetIException& ) { // failure caused by the DataSet operations
        checkClose_file(dsA);
        Rf_error("c++ exception get_HDF5_mean_sd_by_column (DataSet IException)");
    } catch( const std::exception& error ) {
        checkClose_file(dsA);
        Rf_error("c++ exception get_HDF5_mean_sd_by_column function: %s",error.what());
    }
    
}

7 Usage Example

#include "BigDataStatMeth.hpp"

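// Example usage: the function returns void; results are written into the
// caller-provided `normalize` matrix (row 0: means, row 1: standard deviations).
BigDataStatMeth::get_HDF5_mean_sd_by_column(...);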