RcppbdCorr_hdf5_single

C++ Function Reference

1 Signature

Rcpp::List BigDataStatMeth::RcppbdCorr_hdf5_single(const std::string &filename, const std::string &strsubgroup, const std::string &strdataset, const std::string &method, bool use_complete_obs, bool compute_pvalues, int block_size, bool bforce, const Rcpp::Nullable< Rcpp::CharacterVector > &output_group, const Rcpp::Nullable< Rcpp::CharacterVector > &output_dataset_corr, const Rcpp::Nullable< Rcpp::CharacterVector > &output_dataset_pval, bool trans_x, const Rcpp::Nullable< int > &threads)

2 Description

Implementation of single matrix correlation computation for HDF5 matrices - OPTIMIZED.

3 Parameters

  • filename (const std::string &): Path to HDF5 file containing the matrix
  • strsubgroup (const std::string &): Group path within HDF5 file
  • strdataset (const std::string &): Dataset name within the group
  • method (const std::string &): Correlation method (“pearson” or “spearman”)
  • use_complete_obs (bool): Whether to use only complete observations
  • compute_pvalues (bool): Whether to compute p-values (performance impact)
  • block_size (int): Block size for large matrix processing (default: 1000)
  • bforce (bool): Whether to overwrite existing results
  • output_group (const Rcpp::Nullable< Rcpp::CharacterVector > &): Custom output group name (nullable)
  • output_dataset_corr (const Rcpp::Nullable< Rcpp::CharacterVector > &): Custom correlation dataset name (nullable)
  • output_dataset_pval (const Rcpp::Nullable< Rcpp::CharacterVector > &): Custom p-values dataset name (nullable)
  • trans_x (bool): Whether to transpose the matrix (samples vs variables correlation)
  • threads (const Rcpp::Nullable< int > &): Number of threads for parallel computation (nullable)

4 Returns

Rcpp::List containing dataset locations and computation metadata

5 Details

Optimized implementation that computes correlation matrix for a single HDF5 dataset. Uses intelligent method selection between direct computation and block-wise processing based on matrix size and available memory.

6 Call Graph

Function dependencies

7 Source Code

File: inst/include/hdf5Algebra/matrixCorrelation.hppLines 1403-1562

inline Rcpp::List RcppbdCorr_hdf5_single(const std::string& filename, 
                                             const std::string& strsubgroup, 
                                             const std::string& strdataset,
                                             const std::string& method, 
                                             bool use_complete_obs,
                                             bool compute_pvalues, 
                                             int block_size, 
                                             bool bforce,
                                             const Rcpp::Nullable<Rcpp::CharacterVector>& output_group,
                                             const Rcpp::Nullable<Rcpp::CharacterVector>& output_dataset_corr,
                                             const Rcpp::Nullable<Rcpp::CharacterVector>& output_dataset_pval,
                                             bool trans_x,
                                             const Rcpp::Nullable<int>& threads) {
        
        BigDataStatMeth::hdf5Dataset* dsA = nullptr;
        BigDataStatMeth::hdf5Dataset* dsCorr = nullptr;
        BigDataStatMeth::hdf5Dataset* dsPval = nullptr;
        
        try {
            
            std::vector<hsize_t> stride = {1, 1}, block = {1, 1}, offset = {0, 0}, count = {0, 0};
            
            // Open input dataset
            dsA = new BigDataStatMeth::hdf5Dataset(filename, strsubgroup, strdataset, false);
            dsA->openDataset();
            
            if (dsA->getDatasetptr() == nullptr) {
                checkClose_file(dsA);
                throw std::runtime_error("Failed to open input dataset");
            }
            
            // Determine output group and dataset names
            std::string stroutgroup;
            std::string corr_dataset_name;
            std::string pval_dataset_name;
            
            if (output_group.isNull()) {
                stroutgroup = "CORR/" + strdataset;
            } else {
                Rcpp::CharacterVector group_vec = Rcpp::as<Rcpp::CharacterVector>(output_group);
                stroutgroup = Rcpp::as<std::string>(group_vec[0]);
            }
            
            if (output_dataset_corr.isNull()) {
                corr_dataset_name = "correlation";
            } else {
                Rcpp::CharacterVector corr_vec = Rcpp::as<Rcpp::CharacterVector>(output_dataset_corr);
                corr_dataset_name = Rcpp::as<std::string>(corr_vec[0]);
            }
            
            if (output_dataset_pval.isNull()) {
                pval_dataset_name = "pvalues";
            } else {
                Rcpp::CharacterVector pval_vec = Rcpp::as<Rcpp::CharacterVector>(output_dataset_pval);
                pval_dataset_name = Rcpp::as<std::string>(pval_vec[0]);
            }
            
            // Get matrix dimensions
            hsize_t n_rows_orig = dsA->nrows();
            hsize_t n_cols_orig = dsA->ncols();
            
            // Effective dimensions after transposition
            hsize_t n_rows = trans_x ? n_cols_orig : n_rows_orig;
            hsize_t n_cols = trans_x ? n_rows_orig : n_cols_orig;
            count = {n_rows_orig, n_cols_orig};
            
            // Create output datasets
            try {
                dsCorr = new BigDataStatMeth::hdf5Dataset(filename, stroutgroup, corr_dataset_name, bforce);
                if (compute_pvalues) {
                    dsPval = new BigDataStatMeth::hdf5Dataset(filename, stroutgroup, pval_dataset_name, bforce);
                }
            } catch (const std::exception& e) {
                Rcpp::Rcerr << "Error creating output datasets: " << e.what() << std::endl;
                checkClose_file(dsA, dsCorr, dsPval);
                return R_NilValue;
            }
            
            // Automatic method selection based on matrix size
            const hsize_t DIRECT_COMPUTATION_THRESHOLD = MAXELEMSINBLOCK / 4;
            
            if (n_rows * n_cols < DIRECT_COMPUTATION_THRESHOLD) {
                
                // Direct computation for small matrices
                std::vector<double> vdA(count[0] * count[1]);
                dsA->readDatasetBlock({offset[0], offset[1]}, {count[0], count[1]}, stride, block, vdA.data());
                
                // Convert to Eigen matrix (row-major)
                Eigen::MatrixXd X = Eigen::Map<Eigen::Matrix<double, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>(
                    vdA.data(), count[0], count[1]);
                
                // Compute correlation
                corr_result result = RcppbdCorr_matrix_single(X, method, use_complete_obs, compute_pvalues, trans_x, threads);
                
                if (!result.bcomputed) {
                    checkClose_file(dsA, dsCorr, dsPval);
                    throw std::runtime_error("Single matrix correlation computation failed");
                }
                
                // Write correlation matrix
                if (dsCorr->getDatasetptr() == nullptr) {
                    dsCorr->createDataset(result.correlation_matrix.rows(), result.correlation_matrix.cols(), "real");
                }
                dsCorr->writeDataset(Rcpp::wrap(result.correlation_matrix));

                
                // Write p-values matrix if computed
                if (compute_pvalues && result.has_pvalues && dsPval && result.pvalues.rows() > 0 && result.pvalues.cols() > 0) {
                    if (dsPval->getDatasetptr() == nullptr) {
                        dsPval->createDataset(result.pvalues.rows(), result.pvalues.cols(), "real");
                    }
                    dsPval->writeDataset(Rcpp::wrap(result.pvalues));
                }                
                
            } else {
                
                // Block-wise computation for large matrices - CRITICAL path for big-omics
                if (dsA->getDatasetptr() != nullptr) {
                    RcppbdCorr_hdf5_Block_single(dsA, dsCorr, dsPval, method, use_complete_obs, 
                                                           compute_pvalues, trans_x, block_size, threads);
                }
            }
            
            // Clean up datasets
            delete dsA; dsA = nullptr;
            delete dsCorr; dsCorr = nullptr;
            delete dsPval; dsPval = nullptr;
            
            // Return comprehensive result list
            return Rcpp::List::create(
                Rcpp::Named("filename") = filename,
                Rcpp::Named("group") = stroutgroup,
                Rcpp::Named("correlation") = corr_dataset_name,
                Rcpp::Named("method") = method,
                Rcpp::Named("correlation_type") = "single",
                Rcpp::Named("n_variables") = (int)n_cols,
                Rcpp::Named("n_observations") = (int)n_rows,
                Rcpp::Named("use_complete_obs") = use_complete_obs,
                Rcpp::Named("pvalues") = compute_pvalues ? pval_dataset_name : "",
                Rcpp::Named("has_pvalues") = compute_pvalues
            );
            
        } catch(H5::FileIException& error) {
            checkClose_file(dsA, dsCorr, dsPval);
            Rcpp::Rcerr << "\nC++ exception RcppbdCorr_hdf5_single (File IException): " << error.getDetailMsg() << std::endl;
            return R_NilValue;
        } catch(H5::DataSetIException& error) {
            checkClose_file(dsA, dsCorr, dsPval);
            Rcpp::Rcerr << "\nC++ exception RcppbdCorr_hdf5_single (DataSet IException): " << error.getDetailMsg() << std::endl;
            return R_NilValue;
        } catch(std::exception &ex) {
            checkClose_file(dsA, dsCorr, dsPval);
            Rcpp::Rcerr << "C++ exception RcppbdCorr_hdf5_single: " << ex.what() << std::endl;
            return R_NilValue;
        } catch (...) {
            checkClose_file(dsA, dsCorr, dsPval);
            Rcpp::Rcerr << "\nC++ exception RcppbdCorr_hdf5_single (unknown reason)" << std::endl;
            return R_NilValue;
        }
    }

8 Usage Example

#include "BigDataStatMeth.hpp"

// Example usage
auto result = RcppbdCorr_hdf5_single(...);