RcppbdSVD_hdf5_Block

C++ Function Reference

1 Signature

void BigDataStatMeth::RcppbdSVD_hdf5_Block(BigDataStatMeth::hdf5Dataset *dsA, BigDataStatMeth::hdf5Dataset *dsu, BigDataStatMeth::hdf5Dataset *dsv, BigDataStatMeth::hdf5Dataset *dsd, int k, int q, int nev, bool bcenter, bool bscale, int irows, int icols, double dthreshold, Rcpp::Nullable< int > threads=R_NilValue)

2 Description

Block-wise SVD decomposition for large HDF5 matrices.

3 Parameters

  • dsA (BigDataStatMeth::hdf5Dataset *): Input HDF5 dataset containing the matrix
  • dsu (BigDataStatMeth::hdf5Dataset *): Output HDF5 dataset for left singular vectors (U)
  • dsv (BigDataStatMeth::hdf5Dataset *): Output HDF5 dataset for right singular vectors (V)
  • dsd (BigDataStatMeth::hdf5Dataset *): Output HDF5 dataset for singular values (S)
  • k (int): Number of local SVDs to concatenate at each level
  • q (int): Number of decomposition levels
  • nev (int): Number of eigenvalues per block
  • bcenter (bool): Whether to center the data
  • bscale (bool): Whether to scale the data
  • irows (int): Number of rows in input matrix
  • icols (int): Number of columns in input matrix
  • dthreshold (double): Threshold for numerical computations
  • threads (Rcpp::Nullable< int >): Number of parallel threads (optional)

4 Details

Performs SVD computation on large matrices using block-wise decomposition strategy. This function handles matrices too large to fit in memory by processing them in blocks and combining results.

5 Call Graph

Function dependencies

6 Source Code

File: inst/include/hdf5Algebra/matrixSvd.hppLines 171-332

inline void RcppbdSVD_hdf5_Block( BigDataStatMeth::hdf5Dataset* dsA, 
                                      BigDataStatMeth::hdf5Dataset* dsu, BigDataStatMeth::hdf5Dataset* dsv, 
                                      BigDataStatMeth::hdf5Dataset* dsd, int k, int q, int nev, bool bcenter, bool bscale, 
                                      int irows, int icols, double dthreshold, Rcpp::Nullable<int> threads = R_NilValue )
    {
        
        BigDataStatMeth::hdf5Dataset* dsnormalizedData = nullptr;
        BigDataStatMeth::hdf5Dataset* dsJoined = nullptr;
        BigDataStatMeth::hdf5DatasetInternal* dsnormalizedData_i = nullptr;
        
        try{
            
            std::vector<hsize_t> stride = {1, 1},
                block = {1, 1};
            
            svdeig retsvd;
            // Eigen::MatrixXd nX;
            Eigen::MatrixXd matlast;
            Eigen::MatrixXd v;    
            bool transp = false;
            std::string strGroupName  = "tmpgroup",
                strPrefix; // Outpu group name for temporal data
            
            Rcpp::CharacterVector strvmatnames = {"A","B","C","D","E","F","G","H","I","J","K",
                                                  "L","M","N","O","P","Q","R","S","T","U","V",
                                                  "W","X","Y","Z"};
            strPrefix = strvmatnames[q-1];
            
            if(irows >= icols) {
                transp = true;
            }
            
            First_level_SvdBlock_decomposition_hdf5( dsA, strGroupName, k, q, nev, bcenter, bscale, dthreshold, threads);
            
            for(int j = 1; j < q; j++) { // For each decomposition level :
                Next_level_SvdBlock_decomposition_hdf5( dsA, strGroupName, k, j, dthreshold, threads);
            }
            
            // Get dataset names
            Rcpp::StringVector joindata =  dsA->getDatasetNames(strGroupName, strPrefix, "");
            
            // 1.- Join matrix and remove parts from file
            std::string strnewdataset = std::string((joindata[0])).substr(0,1);
            
            dsJoined = new hdf5DatasetInternal(dsA->getFullPath(), strGroupName, strnewdataset, true);
            
            join_datasets( dsJoined, strGroupName, joindata, false, true );
            
            // 2.- Get SVD from Blocks full mattrix
            hsize_t* dims_out = dsJoined->dim();
            
            std::vector<double> vdreaded( dims_out[0] * dims_out[1] );
            dsJoined->readDatasetBlock( {0, 0}, {dims_out[0], dims_out[1]}, {1, 1}, {1, 1}, vdreaded.data() );
            
            matlast = Eigen::Map<Eigen::Matrix<double, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>> (vdreaded.data(), dims_out[0], dims_out[1] );
            delete dsJoined; dsJoined = nullptr;
            
            retsvd = RcppbdSVD_lapack(matlast, false, false, false);
            
            
            // Write results to hdf5 file : in folder "SVD" and dataset "SVD".<name input dataset>
            dsd->createDataset( 1, retsvd.d.size(), "real");
            dsd->writeDataset( Rcpp::wrap(retsvd.d) );
            
            // 3.- crossprod initial matrix and svdA$u
            
            if( bcenter == true || bscale == true || (dsA->getGroupName().find("NORMALIZED_T") != std::string::npos) ) {
                
                if(bcenter == true || bscale == true) {
                    
                    dsnormalizedData = new BigDataStatMeth::hdf5Dataset( dsA->getFullPath(), strGroupName, "normalmatrix", false);
                    dsnormalizedData->openDataset();
                    
                    dims_out = dsnormalizedData->dim();
                    
                    Eigen::MatrixXd A = Eigen::MatrixXd::Zero(dims_out[1], dims_out[0]);
                    dsnormalizedData->readDatasetBlock( {0, 0}, {dims_out[0], dims_out[1]}, stride, block, A.data() );
                    
                    delete dsnormalizedData; dsnormalizedData = nullptr;
                    
                    if(transp == false) {
                        v = A * retsvd.u;
                    } else {
                        v = A.transpose() * retsvd.u;
                        // v = Rcpp_block_matrix_mul_parallel(A,v retsvd.u, false, false, R_NilValue, threads); // multiplication
                    }
                    
                    // v = Rcpp_block_matrix_mul_parallel(A, retsvd.u, false, false, R_NilValue, threads);
                    
                } else {
                    
                    dsnormalizedData_i = new BigDataStatMeth::hdf5DatasetInternal( dsA->getFullPath(), dsA->getGroupName(), dsA->getDatasetName(), false);
                    dsnormalizedData_i->openDataset();
                    
                    dims_out = dsnormalizedData_i->dim();
                    
                    std::vector<double> vdA( dims_out[0] * dims_out[1] );
                    dsnormalizedData_i->readDatasetBlock( {0, 0}, {dims_out[0], dims_out[1]}, stride, block, vdA.data() );
                    Eigen::Map<Eigen::Matrix<double, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>> A (vdA.data(), dims_out[0], dims_out[1]);
                    delete dsnormalizedData_i; dsnormalizedData_i= nullptr;
                    
                    v = Rcpp_block_matrix_mul_parallel(A, retsvd.u, false, false, R_NilValue, threads);
                    
                }
                
            } else {
                
                dims_out = dsA->dim();
                Eigen::MatrixXd A;
                std::vector<double> vdA( dims_out[0] * dims_out[1] );
                dsA->readDatasetBlock( {0,0}, {dims_out[0], dims_out[1] }, stride, block, vdA.data() );
                
                A = Eigen::Map<Eigen::Matrix<double, Eigen::Dynamic, Eigen::Dynamic, Eigen::ColMajor>> (vdA.data(), dims_out[1], dims_out[0] );
                
                if(transp == false) {
                    v = Rcpp_block_matrix_mul_parallel(A, retsvd.u, true, false, R_NilValue, threads); // crossprod
                } else {
                    v = Rcpp_block_matrix_mul_parallel(A, retsvd.u, false, false, R_NilValue, threads); // multiplication
                }
            }
            
            // 4.- resuls / svdA$d
            // v = v.array().rowwise()/(retsvd.d).transpose().array();
            v = v.array().rowwise() / Eigen::Map<Eigen::RowVectorXd>(retsvd.d.data(), (retsvd.d).size()).array();
            
            if (transp == true)  {
                dsu->createDataset( v.rows(), v.cols(), "real");
                dsv->createDataset( retsvd.u.rows(), retsvd.u.cols(), "real");
                
                dsu->writeDataset(Rcpp::wrap(v));
                dsv->writeDataset(Rcpp::wrap(retsvd.u));
            } else {
                dsu->createDataset( retsvd.u.rows(), retsvd.u.cols(), "real");
                dsv->createDataset( v.rows(), v.cols(), "real");
                
                dsu->writeDataset(Rcpp::wrap(retsvd.u));
                dsv->writeDataset(Rcpp::wrap(v));
            }
            
            remove_elements(dsA->getFileptr(), strGroupName);
            
            
        }  catch( H5::FileIException& error ) { // catch failure caused by the H5File operations
            checkClose_file(dsA, dsd, dsu, dsv, dsnormalizedData, dsJoined, dsnormalizedData_i);
            Rcpp::Rcerr<<"\nc++ exception RcppbdSVD_hdf5_Block (File IException)\n";
            return void();
        } catch( H5::DataSetIException& error ) { // catch failure caused by the DataSet operations
            checkClose_file(dsA, dsd, dsu, dsv, dsnormalizedData, dsJoined, dsnormalizedData_i);
            Rcpp::Rcerr<<"\nc++ exception RcppbdSVD_hdf5_Block (DataSet IException)\n";
            return void();
        } catch(std::exception &ex) {
            checkClose_file(dsA, dsd, dsu, dsv, dsnormalizedData, dsJoined, dsnormalizedData_i);
            Rcpp::Rcerr<< "C++ exception RcppbdSVD_hdf5_Block : "<< ex.what();
            return void();
        } catch (...) {
            checkClose_file(dsA, dsd, dsu, dsv, dsnormalizedData, dsJoined, dsnormalizedData_i);
            Rcpp::Rcerr<<"\nC++ exception RcppbdSVD_hdf5_Block (unknown reason)";
            return void();
        }
        
        return void();
    }

7 Usage Example

#include "BigDataStatMeth.hpp"

// Example usage
auto result = RcppbdSVD_hdf5_Block(...);