RcppPCAHdf5

C++ Function Reference

1 Signature

void BigDataStatMeth::RcppPCAHdf5(std::string filename, std::string strgroup, std::string strdataset, std::string strSVDgroup, int k, int q, int nev, bool bcenter, bool bscale, double dthreshold, bool bforce, bool asRowMajor, Rcpp::Nullable< Rcpp::CharacterVector > method=R_NilValue, Rcpp::Nullable< int > ithreads=R_NilValue)

2 Description

Perform Principal Component Analysis.

3 Parameters

  • filename (std::string): HDF5 file name
  • strgroup (std::string): Group that contains the input dataset
  • strdataset (std::string): Dataset name
  • strSVDgroup (std::string): SVD group name
  • k (int): Number of components to compute
  • q (int): Block size for processing
  • nev (int): Number of eigenvalues
  • bcenter (bool): Whether to center the data
  • bscale (bool): Whether to scale the data
  • dthreshold (double): Convergence threshold
  • bforce (bool): Whether to force recomputation of the SVD and allow existing PCA results to be overwritten
  • asRowMajor (bool): Whether data is in row-major order
  • method (Rcpp::Nullable< Rcpp::CharacterVector >): Method selection (optional)
  • ithreads (Rcpp::Nullable< int >): Number of threads (optional)

4 Details

Performs PCA on an HDF5 dataset with options for:

  • Full or truncated analysis
  • Data preprocessing (centering/scaling)
  • Method selection
  • Parallel processing

5 Call Graph

Function dependencies

6 Source Code

File: inst/include/hdf5Algebra/matrixPCA.hpp (lines 360-482)

inline void RcppPCAHdf5( std::string filename, std::string strgroup, std::string strdataset,  
                             std::string strSVDgroup, int k, int q, int nev, 
                             bool bcenter, bool bscale, double dthreshold, 
                             bool bforce, bool asRowMajor, 
                             Rcpp::Nullable<Rcpp::CharacterVector> method = R_NilValue,
                             Rcpp::Nullable<int> ithreads = R_NilValue)
    {
        
        try{

            H5::Exception::dontPrint();
            
            std::unique_ptr<BigDataStatMeth::hdf5Dataset> dsA(nullptr);
            std::unique_ptr<BigDataStatMeth::hdf5Dataset> dsd(nullptr);
            std::unique_ptr<BigDataStatMeth::hdf5Dataset> dsu(nullptr);
            std::unique_ptr<BigDataStatMeth::hdf5Dataset> dsv(nullptr);
            std::unique_ptr<BigDataStatMeth::hdf5Dataset> dsX(nullptr);
            
            std::string strPCAgroup = "PCA/" + strdataset;
            bool bexistsSVD, bexistsPCA;
                        
                        
            // Check for svd decomposition (u, v and d matrices) in hdf5 file or if we 
            // need to compute again the SVD ( foce = true )
            // BigDataStatMeth::hdf5File* file = new BigDataStatMeth::hdf5File(filename, false);
            {
               std::unique_ptr<BigDataStatMeth::hdf5File> file(nullptr);
                file.reset(  new BigDataStatMeth::hdf5File(filename, false) );
                file->openFile("r");
                
                bexistsSVD = exists_HDF5_element(file->getFileptr(), strSVDgroup);
                bexistsPCA = exists_HDF5_element(file->getFileptr(), strPCAgroup);
            }
            
            // {
            //     // Open via hdf5Dataset (RDWR) — mirrors SVD path.
            //     // Opening RDONLY when the file is already open RDWR in the same
            //     // process fails on macOS; RDWR reuses the existing file ID correctly.
            //     std::unique_ptr<BigDataStatMeth::hdf5Dataset> dsTmp(
            //             new BigDataStatMeth::hdf5Dataset(filename, strgroup, strdataset, false));
            //     bexistsSVD = exists_HDF5_element(dsTmp->getFileptr(), strSVDgroup);
            //     bexistsPCA = exists_HDF5_element(dsTmp->getFileptr(), strPCAgroup);
            // }
            
            if( bexistsSVD == 0 ||  bforce == true ) 
            {
            
                // dsA = new BigDataStatMeth::hdf5Dataset(filename, strgroup, strdataset, false);
                dsA.reset( new BigDataStatMeth::hdf5Dataset(filename, strgroup, strdataset, false) );
                dsA->openDataset();
                if( dsA->getDatasetptr() != nullptr ) {
                    RcppTypifyNormalizeHdf5( dsA.get(), bcenter, bscale, false); // Normalize and tipify data ( ((x-mu)/(sd)) * 1/sqrt(n-1) )
                } else {
                    // checkClose_file(dsA, dsd, dsu, dsv, dsX);
                    return void();
                }
                
                // delete dsA; dsA = nullptr;
                BigDataStatMeth::RcppbdSVD_hdf5( filename, "NORMALIZED_T/" + strgroup, strdataset, k, q, nev, false, false, dthreshold, bforce, asRowMajor, method, ithreads );
                strSVDgroup = "SVD/" +  strdataset;
                
            } else {
                strSVDgroup = strSVDgroup + strdataset;   // 20260227
            }
            
            // Check if PCA decomposition exists
            if( bexistsPCA != 0  && bforce == false) {
                Rcpp::Rcout<<"PCA decomposition exits, please set overwrite = true to overwrite the existing results";
                return void();
            }
            
            // ------------ Variables ----------------
            
            // dsd = new BigDataStatMeth::hdf5Dataset(filename, strSVDgroup, "d", false );
            dsd.reset( new BigDataStatMeth::hdf5Dataset(filename, strSVDgroup, "d", false ) );
            dsd->openDataset();
            
            // dsv = new BigDataStatMeth::hdf5Dataset(filename, strSVDgroup, "v", false );
            dsv.reset( new BigDataStatMeth::hdf5Dataset(filename, strSVDgroup, "v", false ) );
            dsv->openDataset();
            
            if( dsd->getDatasetptr() != nullptr && dsv->getDatasetptr() != nullptr) {
                RcppGetPCAVariablesHdf5( strPCAgroup, dsd.get(), dsv.get(), bforce );
            }
            
            // delete dsv; dsv = nullptr;
            // delete dsA;
            
            // ------------ Individuals ----------------
            
            // dsX = new BigDataStatMeth::hdf5Dataset(filename, strgroup, strdataset, false);
            dsX.reset( new BigDataStatMeth::hdf5Dataset(filename, strgroup, strdataset, false) );
            dsX->openDataset();
            
            // dsu = new BigDataStatMeth::hdf5Dataset(filename, strSVDgroup, "u", false );
            dsu.reset( new BigDataStatMeth::hdf5Dataset(filename, strSVDgroup, "u", false ) );
            dsu->openDataset();
            
            if( dsX->getDatasetptr() != nullptr && dsd->getDatasetptr() != nullptr && dsu->getDatasetptr() != nullptr) {
                RcppGetPCAIndividualsHdf5( strPCAgroup, dsX.get(), dsd.get(), dsu.get(), bforce );
            }
            
            
        } catch( H5::FileIException& error ) { // catch failure caused by the H5File operations
            // checkClose_file(dsA, dsd, dsu, dsv, dsX);
            throw std::runtime_error("c++ exception RcppPCAHdf5 (File IException)");
        } catch( H5::DataSetIException& error ) { // catch failure caused by the DataSet operations
            // checkClose_file(dsA, dsd, dsu, dsv, dsX);
            throw std::runtime_error("c++ exception RcppPCAHdf5 (DataSet IException)");
        } catch( H5::DataSpaceIException& error ) { // catch failure caused by the DataSpace operations
            // checkClose_file(dsA, dsd, dsu, dsv, dsX);
            throw std::runtime_error("c++ exception RcppPCAHdf5 (DataSpace IException)");
        } catch(std::exception &ex) {
            // checkClose_file(dsA, dsd, dsu, dsv, dsX);
            throw std::runtime_error(std::string("c++ exception RcppPCAHdf5: ") + ex.what());
        } catch (...) {
            // checkClose_file(dsA, dsd, dsu, dsv, dsX);
            throw std::runtime_error("C++ exception RcppPCAHdf5 (unknown reason)");
        }
        
        return void();
        
    }

7 Usage Example

#include "BigDataStatMeth.hpp"

// Example usage. The function returns void, so there is no result to capture;
// the PCA output is written to the HDF5 file under "PCA/<dataset name>".
BigDataStatMeth::RcppPCAHdf5(...);