RcppbdSVD_hdf5_Block

C++ Function Reference

1 Signature

void BigDataStatMeth::RcppbdSVD_hdf5_Block(BigDataStatMeth::hdf5Dataset *dsA, BigDataStatMeth::hdf5Dataset *dsu, BigDataStatMeth::hdf5Dataset *dsv, BigDataStatMeth::hdf5Dataset *dsd, int k, int q, int nev, bool bcenter, bool bscale, int irows, int icols, double dthreshold, Rcpp::Nullable< int > threads=R_NilValue)

2 Description

Block-wise SVD decomposition for large HDF5 matrices.

3 Parameters

  • dsA (BigDataStatMeth::hdf5Dataset *): Input HDF5 dataset containing the matrix
  • dsu (BigDataStatMeth::hdf5Dataset *): Output HDF5 dataset for left singular vectors (U)
  • dsv (BigDataStatMeth::hdf5Dataset *): Output HDF5 dataset for right singular vectors (V)
  • dsd (BigDataStatMeth::hdf5Dataset *): Output HDF5 dataset for singular values (S)
  • k (int): Number of local SVDs to concatenate at each level
  • q (int): Number of decomposition levels
  • nev (int): Number of eigenvalues per block
  • bcenter (bool): Whether to center the data
  • bscale (bool): Whether to scale the data
  • irows (int): Number of rows in input matrix
  • icols (int): Number of columns in input matrix
  • dthreshold (double): Threshold for numerical computations
  • threads (Rcpp::Nullable< int >): Number of parallel threads (optional)

4 Details

Performs SVD computation on large matrices using block-wise decomposition strategy. This function handles matrices too large to fit in memory by processing them in blocks and combining results.

5 Call Graph

Function dependencies

6 Source Code

File: inst/include/hdf5Algebra/matrixSvd.hppLines 239-441

inline void RcppbdSVD_hdf5_Block( BigDataStatMeth::hdf5Dataset* dsA, 
                                      BigDataStatMeth::hdf5Dataset* dsu, BigDataStatMeth::hdf5Dataset* dsv, 
                                      BigDataStatMeth::hdf5Dataset* dsd, int k, int q, int nev, bool bcenter, bool bscale, 
                                      int irows, int icols, double dthreshold, Rcpp::Nullable<int> threads = R_NilValue )
    {
        
        try{

            std::unique_ptr<BigDataStatMeth::hdf5Dataset> dsnormalizedData(nullptr);
            std::unique_ptr<BigDataStatMeth::hdf5DatasetInternal> dsJoined(nullptr);
            std::unique_ptr<BigDataStatMeth::hdf5DatasetInternal> dsnormalizedData_i(nullptr);
            
            std::vector<hsize_t> stride = {1, 1},
                block = {1, 1};
            
            svdeig retsvd;
            // Eigen::MatrixXd nX;
            Eigen::MatrixXd matlast;
            Eigen::MatrixXd v;    
            bool transp = false;
            std::string strGroupName  = "tmpgroup",
                        strPrefix; // Outpu group name for temporal data
            
            Rcpp::CharacterVector strvmatnames = {"A","B","C","D","E","F","G","H","I","J","K",
                                                  "L","M","N","O","P","Q","R","S","T","U","V",
                                                  "W","X","Y","Z"};
            strPrefix = strvmatnames[q-1];
            
            if(irows >= icols) {
                transp = true;
            }
            
            First_level_SvdBlock_decomposition_hdf5( dsA, strGroupName, k, q, nev, bcenter, bscale, dthreshold, threads);
            for(int j = 1; j < q; j++) { // For each decomposition level :
                Next_level_SvdBlock_decomposition_hdf5( dsA, strGroupName, k, j, dthreshold, threads);
            }
            
            // Get dataset names
            Rcpp::StringVector joindata =  dsA->getDatasetNames(strGroupName, strPrefix, "");
            
            // 1.- Join matrix and remove parts from file
            std::string strnewdataset = std::string((joindata[0])).substr(0,1);
            
            // dsJoined = new hdf5DatasetInternal(dsA->getFullPath(), strGroupName, strnewdataset, true);
            dsJoined.reset( new BigDataStatMeth::hdf5DatasetInternal(dsA->getFullPath(), strGroupName, strnewdataset, true) );
            
            join_datasets( dsJoined.get(), strGroupName, joindata, false, true );
            
            // 2.- Get SVD from Blocks full mattrix
            hsize_t* dims_out = dsJoined->dim();
            
            std::vector<double> vdreaded( dims_out[0] * dims_out[1] );
            dsJoined->readDatasetBlock( {0, 0}, {dims_out[0], dims_out[1]}, {1, 1}, {1, 1}, vdreaded.data() );
            
            matlast = Eigen::Map<Eigen::Matrix<double, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>> (vdreaded.data(), dims_out[0], dims_out[1] );
            // delete dsJoined; dsJoined = nullptr;
            
            retsvd = RcppbdSVD_lapack(matlast, false, false, false);
            
            // Write results to hdf5 file : in folder "SVD" and dataset "SVD".<name input dataset>
            dsd->inheritCompressionLevel(dsA->getCompressionLevel());
            dsd->createDataset( 1, retsvd.d.size(), "real");
            dsd->writeDataset( Rcpp::wrap(retsvd.d) );
            
            // 3.- Compute v = source * (u / d) block-wise — never loads full matrix into RAM
            {
                // Pre-divide u by d (tiny operation, u is n_small × k)
                Eigen::MatrixXd u_div_d = retsvd.u.array().rowwise() /
                    Eigen::Map<Eigen::RowVectorXd>(retsvd.d.data(), retsvd.d.size()).array();
                
                // Write u_div_d to a small temp HDF5 dataset
                std::unique_ptr<BigDataStatMeth::hdf5Dataset> ds_udivd(nullptr);
                ds_udivd.reset( new BigDataStatMeth::hdf5Dataset(dsA->getFullPath(), strGroupName, "udivd", true) );
                ds_udivd->setCompressionLevel(0);
                ds_udivd->createDataset( u_div_d.rows(), u_div_d.cols(), "real" );
                ds_udivd->writeDataset( Rcpp::wrap(u_div_d) );
                
                // Temp dataset for the product result (also small: n_small × k)
                std::unique_ptr<BigDataStatMeth::hdf5Dataset> ds_vtmp(nullptr);
                ds_vtmp.reset( new BigDataStatMeth::hdf5Dataset(dsA->getFullPath(), strGroupName, "vtmp", true) );
                
                // if( bcenter == true || bscale == true ) {
                //     dsnormalizedData.reset( new BigDataStatMeth::hdf5Dataset( dsA->getFullPath(), strGroupName, "normalmatrix", false) );
                //     dsnormalizedData->openDataset();
                //     
                //     multiplication( dsnormalizedData.get(), ds_udivd.get(), ds_vtmp.get(),
                //                     transp, false, R_NilValue, R_NilValue, threads );
                // } else if( dsA->getGroupName().find("NORMALIZED_T") != std::string::npos ) {
                //     dsnormalizedData_i.reset( new BigDataStatMeth::hdf5DatasetInternal( dsA->getFullPath(), dsA->getGroupName(), dsA->getDatasetName(), false) );
                //     dsnormalizedData_i->openDataset();
                //     multiplication( dsnormalizedData_i.get(), ds_udivd.get(), ds_vtmp.get(),
                //                     false, false, R_NilValue, R_NilValue, threads );
                // } else {
                //     multiplication( dsA, ds_udivd.get(), ds_vtmp.get(),
                //                     !transp, false, R_NilValue, R_NilValue, threads );
                // }
                // 
                // // Read result back — it is n_small × k, always fits in RAM
                // hsize_t* vdims = ds_vtmp->dim();
                // std::vector<double> vdbuf( vdims[0] * vdims[1] );
                // ds_vtmp->readDatasetBlock( {0,0}, {vdims[0], vdims[1]}, stride, block, vdbuf.data() );
                // v = Eigen::Map<Eigen::MatrixXd>( vdbuf.data(), vdims[1], vdims[0] );
                
                
                if( bcenter == true || bscale == true ) {
                    dsnormalizedData.reset( new BigDataStatMeth::hdf5Dataset( dsA->getFullPath(), strGroupName, "normalmatrix", false) );
                    dsnormalizedData->openDataset();
                    
                    if( transp == false ) {
                        // normalmatrix dims [R_nrows, R_ncols] — multiplication() works correctly here
                        multiplication( dsnormalizedData.get(), ds_udivd.get(), ds_vtmp.get(),
                                        false, false, R_NilValue, R_NilValue, threads );
                    } else {
                        
                        // transp=true: multiplication() reads out of bounds for this storage convention.
                        // v = normalmatrix_R * u_div_d = (R_nrows × R_ncols) * (R_ncols × k).
                        // Read normalmatrix in row-blocks — never fully in RAM.
                        const hsize_t nR  = dsnormalizedData->nrows();  // R_nrows
                        const hsize_t nC  = dsnormalizedData->ncols();  // R_ncols
                        const hsize_t blk = std::max( (hsize_t)1, MAXELEMSINBLOCK / nC );
                        v = Eigen::MatrixXd::Zero( nR, u_div_d.cols() );
                        for( hsize_t off = 0; off < nR; off += blk ) {
                            const hsize_t cnt = std::min( blk, nR - off );
                            std::vector<double> buf( cnt * nC );
                            dsnormalizedData->readDatasetBlock( {off, 0}, {cnt, nC}, stride, block, buf.data() );
                            Eigen::Map<Eigen::Matrix<double, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>
                                Ablock( buf.data(), cnt, nC );
                            v.middleRows( off, cnt ).noalias() = Ablock * u_div_d;
                        }
                    }
                } else if( dsA->getGroupName().find("NORMALIZED_T") != std::string::npos ) {
                    
                    // hdf5Dataset ==> Mirall plot
                    dsnormalizedData_i.reset( new BigDataStatMeth::hdf5DatasetInternal( dsA->getFullPath(), dsA->getGroupName(), dsA->getDatasetName(), false) );
                    dsnormalizedData_i->openDataset();
                    
                    // hsize_t* tmp_dim = dsnormalizedData_i->dim();
                    
                    // std::unique_ptr<BigDataStatMeth::hdf5Dataset> dsnormalizedData_ni(nullptr);
                    // dsnormalizedData_ni.reset( new BigDataStatMeth::hdf5Dataset( dsA->getFullPath(), dsA->getGroupName(), dsA->getDatasetName(), false) );
                    // dsnormalizedData_ni->openDataset();
                    // 
                    // tmp_dim = dsnormalizedData_ni->dim();
                    
                    multiplication( dsnormalizedData_i.get(), ds_udivd.get(), ds_vtmp.get(), true, false, R_NilValue, R_NilValue, threads );
                } else {
                    
                    multiplication( dsA, ds_udivd.get(), ds_vtmp.get(),
                                    !transp, false, R_NilValue, R_NilValue, threads );
                }
                
                // Read result back from ds_vtmp — only when populated by multiplication()
                if( !( (bcenter == true || bscale == true) && transp == true ) ) {
                    hsize_t* vdims = ds_vtmp->dim();
                    std::vector<double> vdbuf( vdims[0] * vdims[1] );
                    ds_vtmp->readDatasetBlock( {0,0}, {vdims[0], vdims[1]}, stride, block, vdbuf.data() );
                    v = Eigen::Map<Eigen::MatrixXd>( vdbuf.data(), vdims[1], vdims[0] );
                }
                
                
            }

            // 4.- resuls / svdA$d
            // v = v.array().rowwise()/(retsvd.d).transpose().array();
            //..  2026/05/03 ..//v = v.array().rowwise() / Eigen::Map<Eigen::RowVectorXd>(retsvd.d.data(), (retsvd.d).size()).array();
            
            if (transp == true)  {
                dsu->inheritCompressionLevel(dsA->getCompressionLevel());
                dsu->createDataset( v.rows(), v.cols(), "real");
                dsv->inheritCompressionLevel(dsA->getCompressionLevel());
                dsv->createDataset( retsvd.u.rows(), retsvd.u.cols(), "real");
                
                dsu->writeDataset(Rcpp::wrap(v));
                dsv->writeDataset(Rcpp::wrap(retsvd.u));
            } else {
                dsu->inheritCompressionLevel(dsA->getCompressionLevel());
                dsu->createDataset( retsvd.u.rows(), retsvd.u.cols(), "real");
                dsv->inheritCompressionLevel(dsA->getCompressionLevel());
                dsv->createDataset( v.rows(), v.cols(), "real");
                
                dsu->writeDataset(Rcpp::wrap(retsvd.u));
                dsv->writeDataset(Rcpp::wrap(v));
            }
            
            remove_elements(dsA->getFileptr(), strGroupName);
            
            
        }  catch( H5::FileIException& error ) { // catch failure caused by the H5File operations
            // checkClose_file(dsA, dsd, dsu, dsv, dsnormalizedData, dsJoined, dsnormalizedData_i);
            throw std::runtime_error("c++ exception RcppbdSVD_hdf5_Block (File IException)");
        } catch( H5::DataSetIException& error ) { // catch failure caused by the DataSet operations
            // checkClose_file(dsA, dsd, dsu, dsv, dsnormalizedData, dsJoined, dsnormalizedData_i);
            throw std::runtime_error("c++ exception RcppbdSVD_hdf5_Block (DataSet IException)");
        } catch(std::exception &ex) {
            // checkClose_file(dsA, dsd, dsu, dsv, dsnormalizedData, dsJoined, dsnormalizedData_i);
            throw std::runtime_error(std::string("C++ exception RcppbdSVD_hdf5_Block: ") + ex.what());
        } catch (...) {
            // checkClose_file(dsA, dsd, dsu, dsv, dsnormalizedData, dsJoined, dsnormalizedData_i);
            throw std::runtime_error("C++ exception RcppbdSVD_hdf5_Block (unknown reason)");
        }
        
        return void();
    }

7 Usage Example

#include "BigDataStatMeth.hpp"

// Example usage
auto result = RcppbdSVD_hdf5_Block(...);