RcppPCAHdf5
C++ Function Reference
1 Signature
void BigDataStatMeth::RcppPCAHdf5(std::string filename, std::string strgroup, std::string strdataset, std::string strSVDgroup, int k, int q, int nev, bool bcenter, bool bscale, double dthreshold, bool bforce, bool asRowMajor, Rcpp::Nullable< Rcpp::CharacterVector > method=R_NilValue, Rcpp::Nullable< int > ithreads=R_NilValue)2 Description
Perform Principal Component Analysis.
3 Parameters
filename(std::string): HDF5 file namestrgroup(std::string): Group name for resultsstrdataset(std::string): Dataset namestrSVDgroup(std::string): SVD group namek(int): Number of components to computeq(int): Block size for processingnev(int): Number of eigenvaluesbcenter(bool): Whether to center the databscale(bool): Whether to scale the datadthreshold(double): Convergence thresholdbforce(bool): Whether to force computationasRowMajor(bool): Whether data is in row-major ordermethod(Rcpp::Nullable< Rcpp::CharacterVector >): Method selection (optional)ithreads(Rcpp::Nullable< int >): Number of threads (optional)
4 Details
Performs PCA on an HDF5 dataset with options for:Full or truncated analysisData preprocessing (centering/scaling)Method selectionParallel processing
5 Call Graph
6 Source Code
NoteImplementation
File: inst/include/hdf5Algebra/matrixPCA.hpp • Lines 360-482
inline void RcppPCAHdf5( std::string filename, std::string strgroup, std::string strdataset,
std::string strSVDgroup, int k, int q, int nev,
bool bcenter, bool bscale, double dthreshold,
bool bforce, bool asRowMajor,
Rcpp::Nullable<Rcpp::CharacterVector> method = R_NilValue,
Rcpp::Nullable<int> ithreads = R_NilValue)
{
try{
H5::Exception::dontPrint();
std::unique_ptr<BigDataStatMeth::hdf5Dataset> dsA(nullptr);
std::unique_ptr<BigDataStatMeth::hdf5Dataset> dsd(nullptr);
std::unique_ptr<BigDataStatMeth::hdf5Dataset> dsu(nullptr);
std::unique_ptr<BigDataStatMeth::hdf5Dataset> dsv(nullptr);
std::unique_ptr<BigDataStatMeth::hdf5Dataset> dsX(nullptr);
std::string strPCAgroup = "PCA/" + strdataset;
bool bexistsSVD, bexistsPCA;
// Check for svd decomposition (u, v and d matrices) in hdf5 file or if we
// need to compute again the SVD ( foce = true )
// BigDataStatMeth::hdf5File* file = new BigDataStatMeth::hdf5File(filename, false);
{
std::unique_ptr<BigDataStatMeth::hdf5File> file(nullptr);
file.reset( new BigDataStatMeth::hdf5File(filename, false) );
file->openFile("r");
bexistsSVD = exists_HDF5_element(file->getFileptr(), strSVDgroup);
bexistsPCA = exists_HDF5_element(file->getFileptr(), strPCAgroup);
}
// {
// // Open via hdf5Dataset (RDWR) — mirrors SVD path.
// // Opening RDONLY when the file is already open RDWR in the same
// // process fails on macOS; RDWR reuses the existing file ID correctly.
// std::unique_ptr<BigDataStatMeth::hdf5Dataset> dsTmp(
// new BigDataStatMeth::hdf5Dataset(filename, strgroup, strdataset, false));
// bexistsSVD = exists_HDF5_element(dsTmp->getFileptr(), strSVDgroup);
// bexistsPCA = exists_HDF5_element(dsTmp->getFileptr(), strPCAgroup);
// }
if( bexistsSVD == 0 || bforce == true )
{
// dsA = new BigDataStatMeth::hdf5Dataset(filename, strgroup, strdataset, false);
dsA.reset( new BigDataStatMeth::hdf5Dataset(filename, strgroup, strdataset, false) );
dsA->openDataset();
if( dsA->getDatasetptr() != nullptr ) {
RcppTypifyNormalizeHdf5( dsA.get(), bcenter, bscale, false); // Normalize and tipify data ( ((x-mu)/(sd)) * 1/sqrt(n-1) )
} else {
// checkClose_file(dsA, dsd, dsu, dsv, dsX);
return void();
}
// delete dsA; dsA = nullptr;
BigDataStatMeth::RcppbdSVD_hdf5( filename, "NORMALIZED_T/" + strgroup, strdataset, k, q, nev, false, false, dthreshold, bforce, asRowMajor, method, ithreads );
strSVDgroup = "SVD/" + strdataset;
} else {
strSVDgroup = strSVDgroup + strdataset; // 20260227
}
// Check if PCA decomposition exists
if( bexistsPCA != 0 && bforce == false) {
Rcpp::Rcout<<"PCA decomposition exits, please set overwrite = true to overwrite the existing results";
return void();
}
// ------------ Variables ----------------
// dsd = new BigDataStatMeth::hdf5Dataset(filename, strSVDgroup, "d", false );
dsd.reset( new BigDataStatMeth::hdf5Dataset(filename, strSVDgroup, "d", false ) );
dsd->openDataset();
// dsv = new BigDataStatMeth::hdf5Dataset(filename, strSVDgroup, "v", false );
dsv.reset( new BigDataStatMeth::hdf5Dataset(filename, strSVDgroup, "v", false ) );
dsv->openDataset();
if( dsd->getDatasetptr() != nullptr && dsv->getDatasetptr() != nullptr) {
RcppGetPCAVariablesHdf5( strPCAgroup, dsd.get(), dsv.get(), bforce );
}
// delete dsv; dsv = nullptr;
// delete dsA;
// ------------ Individuals ----------------
// dsX = new BigDataStatMeth::hdf5Dataset(filename, strgroup, strdataset, false);
dsX.reset( new BigDataStatMeth::hdf5Dataset(filename, strgroup, strdataset, false) );
dsX->openDataset();
// dsu = new BigDataStatMeth::hdf5Dataset(filename, strSVDgroup, "u", false );
dsu.reset( new BigDataStatMeth::hdf5Dataset(filename, strSVDgroup, "u", false ) );
dsu->openDataset();
if( dsX->getDatasetptr() != nullptr && dsd->getDatasetptr() != nullptr && dsu->getDatasetptr() != nullptr) {
RcppGetPCAIndividualsHdf5( strPCAgroup, dsX.get(), dsd.get(), dsu.get(), bforce );
}
} catch( H5::FileIException& error ) { // catch failure caused by the H5File operations
// checkClose_file(dsA, dsd, dsu, dsv, dsX);
throw std::runtime_error("c++ exception RcppPCAHdf5 (File IException)");
} catch( H5::DataSetIException& error ) { // catch failure caused by the DataSet operations
// checkClose_file(dsA, dsd, dsu, dsv, dsX);
throw std::runtime_error("c++ exception RcppPCAHdf5 (DataSet IException)");
} catch( H5::DataSpaceIException& error ) { // catch failure caused by the DataSpace operations
// checkClose_file(dsA, dsd, dsu, dsv, dsX);
throw std::runtime_error("c++ exception RcppPCAHdf5 (DataSpace IException)");
} catch(std::exception &ex) {
// checkClose_file(dsA, dsd, dsu, dsv, dsX);
throw std::runtime_error(std::string("c++ exception RcppPCAHdf5: ") + ex.what());
} catch (...) {
// checkClose_file(dsA, dsd, dsu, dsv, dsX);
throw std::runtime_error("C++ exception RcppPCAHdf5 (unknown reason)");
}
return void();
}7 Usage Example
#include "BigDataStatMeth.hpp"
// Example usage
auto result = RcppPCAHdf5(...);