RcppPCAHdf5

C++ Function Reference

1 Signature

void BigDataStatMeth::RcppPCAHdf5(std::string filename, std::string strgroup, std::string strdataset, std::string strSVDgroup, int k, int q, int nev, bool bcenter, bool bscale, double dthreshold, bool bforce, bool asRowMajor, Rcpp::Nullable< Rcpp::CharacterVector > method=R_NilValue, Rcpp::Nullable< int > ithreads=R_NilValue)

2 Description

Perform Principal Component Analysis.

3 Parameters

  • filename (std::string): HDF5 file name
  • strgroup (std::string): Group name for results
  • strdataset (std::string): Dataset name
  • strSVDgroup (std::string): SVD group name
  • k (int): Number of components to compute
  • q (int): Block size for processing
  • nev (int): Number of eigenvalues
  • bcenter (bool): Whether to center the data
  • bscale (bool): Whether to scale the data
  • dthreshold (double): Convergence threshold
  • bforce (bool): Whether to force computation
  • asRowMajor (bool): Whether data is in row-major order
  • method (Rcpp::Nullable< Rcpp::CharacterVector >): Method selection (optional)
  • ithreads (Rcpp::Nullable< int >): Number of threads (optional)

4 Details

Performs PCA on an HDF5 dataset with options for:Full or truncated analysisData preprocessing (centering/scaling)Method selectionParallel processing

5 Call Graph

Function dependencies

6 Source Code

File: inst/include/hdf5Algebra/matrixPCA.hppLines 347-457

inline void RcppPCAHdf5( std::string filename, std::string strgroup, std::string strdataset,  
                             std::string strSVDgroup, int k, int q, int nev, 
                             bool bcenter, bool bscale, double dthreshold, 
                             bool bforce, bool asRowMajor, 
                             Rcpp::Nullable<Rcpp::CharacterVector> method = R_NilValue,
                             Rcpp::Nullable<int> ithreads = R_NilValue)
    {
        
        BigDataStatMeth::hdf5Dataset* dsA = nullptr;
        BigDataStatMeth::hdf5Dataset* dsd = nullptr;
        BigDataStatMeth::hdf5Dataset* dsu = nullptr;
        BigDataStatMeth::hdf5Dataset* dsv = nullptr;
        BigDataStatMeth::hdf5Dataset* dsX = nullptr;
        
        try{
            
    
            std::string strPCAgroup = "PCA/" + strdataset;
                        
            // Check for svd decomposition (u, v and d matrices) in hdf5 file or if we 
            // need to compute again the SVD ( foce = true )
            BigDataStatMeth::hdf5File* file = new BigDataStatMeth::hdf5File(filename, false);
            file->openFile("r");
            
            bool bexistsSVD = exists_HDF5_element(file->getFileptr(), strSVDgroup);
            bool bexistsPCA = exists_HDF5_element(file->getFileptr(), strPCAgroup);
            
            delete file; file = nullptr;
            
            if( bexistsSVD == 0 ||  bforce == true ) {
                
                dsA = new BigDataStatMeth::hdf5Dataset(filename, strgroup, strdataset, false);
                dsA->openDataset();
                if( dsA->getDatasetptr() != nullptr ) {
                    RcppTypifyNormalizeHdf5( dsA, bcenter, bscale, false); // Normalize and tipify data ( ((x-mu)/(sd)) * 1/sqrt(n-1) )
                } else {
                    checkClose_file(dsA, dsd, dsu, dsv, dsX);
                    return void();
                }
                
                delete dsA; dsA = nullptr;
                
                BigDataStatMeth::RcppbdSVD_hdf5( filename, "NORMALIZED_T/" + strgroup, strdataset, k, q, nev, false, false, dthreshold, bforce, asRowMajor, method, ithreads );
                
                strSVDgroup = "SVD/" +  strdataset;
                
            }
            
            // Check if PCA decomposition exists
            if( bexistsPCA != 0  && bforce == false) {
                Rcpp::Rcout<<"PCA decomposition exits, please set overwrite = true to overwrite the existing results";
                return void();
            }
            
            // ------------ Variables ----------------
            
            dsd = new BigDataStatMeth::hdf5Dataset(filename, strSVDgroup, "d", false );
            dsd->openDataset();
            
            dsv = new BigDataStatMeth::hdf5Dataset(filename, strSVDgroup, "v", false );
            dsv->openDataset();
            
            if( dsd->getDatasetptr() != nullptr && dsv->getDatasetptr() != nullptr) {
                RcppGetPCAVariablesHdf5( strPCAgroup, dsd, dsv, bforce );
            }
            
            delete dsv; dsv = nullptr;
            // delete dsA;
            
            // ------------ Individuals ----------------
            
            dsX = new BigDataStatMeth::hdf5Dataset(filename, strgroup, strdataset, false);
            dsX->openDataset();
            
            dsu = new BigDataStatMeth::hdf5Dataset(filename, strSVDgroup, "u", false );
            dsu->openDataset();
            
            if( dsX->getDatasetptr() != nullptr && dsd->getDatasetptr() != nullptr && dsu->getDatasetptr() != nullptr) {
                RcppGetPCAIndividualsHdf5( strPCAgroup, dsX, dsd, dsu, bforce );
            }
            
            delete dsd; dsd = nullptr;
            delete dsu; dsu = nullptr;
            delete dsX; dsX = nullptr;
            // delete dsA;
            
        } catch( H5::FileIException& error ) { // catch failure caused by the H5File operations
            checkClose_file(dsA, dsd, dsu, dsv, dsX);
            Rcpp::Rcerr<<"\nc++ exception RcppPCAHdf5 (File IException)\n";
            return void();
        } catch( H5::DataSetIException& error ) { // catch failure caused by the DataSet operations
            checkClose_file(dsA, dsd, dsu, dsv, dsX);
            Rcpp::Rcerr<<"\nc++ exception RcppPCAHdf5 (DataSet IException)\n";
            return void();
        } catch( H5::DataSpaceIException& error ) { // catch failure caused by the DataSpace operations
            checkClose_file(dsA, dsd, dsu, dsv, dsX);
            Rcpp::Rcerr<<"\nc++ exception RcppPCAHdf5 (DataSpace IException)\n";
            return void();
        } catch(std::exception &ex) {
            checkClose_file(dsA, dsd, dsu, dsv, dsX);
            Rcpp::Rcerr<<"\nc++ exception RcppPCAHdf5 \n"<< ex.what();
            return void();
        } catch (...) {
            checkClose_file(dsA, dsd, dsu, dsv, dsX);
            Rcpp::Rcerr<<"\nC++ exception RcppPCAHdf5 (unknown reason)";
            return void();
        }
        
        return void();
        
    }

7 Usage Example

#include "BigDataStatMeth.hpp"

// Example usage
auto result = RcppPCAHdf5(...);