RcppbdCorr_hdf5_cross

C++ Function Reference

1 Signature

Rcpp::List BigDataStatMeth::RcppbdCorr_hdf5_cross(const std::string &filename_a, const std::string &strsubgroup_a, const std::string &strdataset_a, const std::string &filename_b, const std::string &strsubgroup_b, const std::string &strdataset_b, const std::string &method, bool use_complete_obs, bool compute_pvalues, int block_size, bool bforce, const std::string &output_filename, const Rcpp::Nullable< Rcpp::CharacterVector > &output_group, const Rcpp::Nullable< Rcpp::CharacterVector > &output_dataset_corr, const Rcpp::Nullable< Rcpp::CharacterVector > &output_dataset_pval, bool trans_x, bool trans_y, const Rcpp::Nullable< int > &threads)

2 Description

Implementation of cross-matrix correlation computation for HDF5 matrices - OPTIMIZED.

3 Parameters

  • filename_a (const std::string &): Path to HDF5 file containing first matrix
  • strsubgroup_a (const std::string &): Group path for first matrix
  • strdataset_a (const std::string &): Dataset name for first matrix
  • filename_b (const std::string &): Path to HDF5 file containing second matrix
  • strsubgroup_b (const std::string &): Group path for second matrix
  • strdataset_b (const std::string &): Dataset name for second matrix
  • method (const std::string &): Correlation method (“pearson” or “spearman”)
  • use_complete_obs (bool): Whether to use only complete observations
  • compute_pvalues (bool): Whether to compute p-values
  • block_size (int): Block size for processing
  • bforce (bool): Whether to overwrite existing results
  • output_filename (const std::string &): Output HDF5 file path
  • output_group (const Rcpp::Nullable< Rcpp::CharacterVector > &): Custom output group name (nullable)
  • output_dataset_corr (const Rcpp::Nullable< Rcpp::CharacterVector > &): Custom correlation dataset name (nullable)
  • output_dataset_pval (const Rcpp::Nullable< Rcpp::CharacterVector > &): Custom p-values dataset name (nullable)
  • trans_x (bool): Whether to transpose first matrix
  • trans_y (bool): Whether to transpose second matrix
  • threads (const Rcpp::Nullable< int > &): Number of threads (nullable)

4 Returns

Rcpp::List containing dataset locations and computation metadata

5 Details

Optimized implementation that computes cross-correlation matrix between two HDF5 datasets. Uses intelligent method selection and efficient memory management for optimal performance.

6 Call Graph

Function dependencies

7 Source Code

File: inst/include/hdf5Algebra/matrixCorrelation.hppLines 1859-2053

inline Rcpp::List RcppbdCorr_hdf5_cross(const std::string& filename_a, 
                                            const std::string& strsubgroup_a, 
                                            const std::string& strdataset_a,
                                            const std::string& filename_b, 
                                            const std::string& strsubgroup_b, 
                                            const std::string& strdataset_b,
                                            const std::string& method, 
                                            bool use_complete_obs,
                                            bool compute_pvalues, 
                                            int block_size, 
                                            bool bforce,
                                            const std::string& output_filename,
                                            const Rcpp::Nullable<Rcpp::CharacterVector>& output_group,
                                            const Rcpp::Nullable<Rcpp::CharacterVector>& output_dataset_corr,
                                            const Rcpp::Nullable<Rcpp::CharacterVector>& output_dataset_pval,
                                            bool trans_x,
                                            bool trans_y,
                                            const Rcpp::Nullable<int>& threads) {
        
        BigDataStatMeth::hdf5Dataset* dsA = nullptr;
        BigDataStatMeth::hdf5Dataset* dsB = nullptr;
        BigDataStatMeth::hdf5Dataset* dsCorr = nullptr;
        BigDataStatMeth::hdf5Dataset* dsPval = nullptr;
        
        try {
            
            // std::vector<hsize_t> stride = {1, 1}, block = {1, 1}, offset = {0, 0};
            
            // Open input datasets
            dsA = new BigDataStatMeth::hdf5Dataset(filename_a, strsubgroup_a, strdataset_a, false);
            dsA->openDataset();
            
            if (dsA->getDatasetptr() == nullptr) {
                checkClose_file(dsA);
                throw std::runtime_error("Failed to open first input dataset");
            }
            
            dsB = new BigDataStatMeth::hdf5Dataset(filename_b, strsubgroup_b, strdataset_b, false);
            dsB->openDataset();
            
            if (dsB->getDatasetptr() == nullptr) {
                checkClose_file(dsA, dsB);
                throw std::runtime_error("Failed to open second input dataset");
            }
            
            // Get original dimensions
            hsize_t n_rows_a_orig = dsA->nrows();
            hsize_t n_cols_a_orig = dsA->ncols();
            hsize_t n_rows_b_orig = dsB->nrows();
            hsize_t n_cols_b_orig = dsB->ncols();
            
            // Determine effective dimensions after transposition
            hsize_t n_rows_a = trans_x ? n_cols_a_orig : n_rows_a_orig;
            hsize_t n_cols_a = trans_x ? n_rows_a_orig : n_cols_a_orig;
            hsize_t n_rows_b = trans_y ? n_cols_b_orig : n_rows_b_orig;
            hsize_t n_cols_b = trans_y ? n_rows_b_orig : n_cols_b_orig;
            
            // Validate dimensions
            if (n_rows_a != n_rows_b) {
                checkClose_file(dsA, dsB);
                throw std::runtime_error("Matrices must have same number of observations after transposition");
            }
            
            // Determine output group and dataset names
            std::string stroutgroup;
            std::string corr_dataset_name;
            std::string pval_dataset_name;
            
            
            if (output_group.isNull()) {
                std::string trans_suffix = "";
                if (trans_x && !trans_y) trans_suffix = "_TX";
                else if (!trans_x && trans_y) trans_suffix = "_TY";
                else if (trans_x && trans_y) trans_suffix = "_TXTY";
                
                stroutgroup = "CORR" + trans_suffix + "/" + strdataset_a + "_vs_" + strdataset_b;
            } else {
                Rcpp::CharacterVector group_vec = Rcpp::as<Rcpp::CharacterVector>(output_group);
                stroutgroup = Rcpp::as<std::string>(group_vec[0]);
            }
            
            if (output_dataset_corr.isNull()) {
                corr_dataset_name = "correlation";
            } else {
                Rcpp::CharacterVector corr_vec = Rcpp::as<Rcpp::CharacterVector>(output_dataset_corr);
                corr_dataset_name = Rcpp::as<std::string>(corr_vec[0]);
            }
            
            if (output_dataset_pval.isNull()) {
                pval_dataset_name = "pvalues";
            } else {
                Rcpp::CharacterVector pval_vec = Rcpp::as<Rcpp::CharacterVector>(output_dataset_pval);
                pval_dataset_name = Rcpp::as<std::string>(pval_vec[0]);
            }
            
            // Create output datasets
            try {
                dsCorr = new BigDataStatMeth::hdf5Dataset(output_filename, stroutgroup, corr_dataset_name, bforce);
                if (compute_pvalues) {
                    dsPval = new BigDataStatMeth::hdf5Dataset(output_filename, stroutgroup, pval_dataset_name, bforce);
                }
            } catch (const std::exception& e) {
                Rcpp::Rcerr << "Error creating output datasets: " << e.what() << std::endl;
                checkClose_file(dsA, dsB, dsCorr, dsPval);
                return R_NilValue;
            }
            
            // Automatic method selection based on total matrix size
            const hsize_t DIRECT_COMPUTATION_THRESHOLD = MAXELEMSINBLOCK / 4;
            hsize_t total_elements = std::max(n_rows_a_orig * n_cols_a_orig, n_rows_b_orig * n_cols_b_orig);
            
            if (total_elements < DIRECT_COMPUTATION_THRESHOLD) {
                
                // Direct computation for smaller matrices
                std::vector<double> vdA(n_rows_a_orig * n_cols_a_orig);
                std::vector<double> vdB(n_rows_b_orig * n_cols_b_orig);
                
                std::vector<hsize_t> stride = {1, 1}, block = {1, 1};
                dsA->readDatasetBlock({0, 0}, {n_rows_a_orig, n_cols_a_orig}, stride, block, vdA.data());
                dsB->readDatasetBlock({0, 0}, {n_rows_b_orig, n_cols_b_orig}, stride, block, vdB.data());
                
                Eigen::MatrixXd X = Eigen::Map<Eigen::Matrix<double, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>(
                    vdA.data(), n_rows_a_orig, n_cols_a_orig);
                Eigen::MatrixXd Y = Eigen::Map<Eigen::Matrix<double, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>(
                    vdB.data(), n_rows_b_orig, n_cols_b_orig);
                
                corr_result result = RcppbdCorr_matrix_cross(X, Y, method, use_complete_obs, compute_pvalues, trans_x, trans_y, threads);
                
                if (!result.bcomputed) {
                    checkClose_file(dsA, dsB, dsCorr, dsPval);
                    throw std::runtime_error("Cross-correlation computation failed");
                }
                
                // Write results
                if (dsCorr->getDatasetptr() == nullptr) {
                    dsCorr->createDataset(result.correlation_matrix.rows(), result.correlation_matrix.cols(), "real");
                }
                dsCorr->writeDataset(Rcpp::wrap(result.correlation_matrix));
                
                if (compute_pvalues && result.has_pvalues && dsPval && result.pvalues.rows() > 0 && result.pvalues.cols() > 0) {
                    if (dsPval->getDatasetptr() == nullptr) {
                        dsPval->createDataset(result.pvalues.rows(), result.pvalues.cols(), "real");
                    }
                    dsPval->writeDataset(Rcpp::wrap(result.pvalues));
                }
                
            } else {
                
                // Block-wise computation for large matrices
                if (dsA->getDatasetptr() != nullptr && dsB->getDatasetptr() != nullptr) {
                    RcppbdCorr_hdf5_Block_cross(dsA, dsB, dsCorr, dsPval, method, use_complete_obs, 
                                                compute_pvalues, block_size, trans_x, trans_y, threads);
                }
            }
            
            // Clean up datasets
            delete dsA; dsA = nullptr;
            delete dsB; dsB = nullptr;
            delete dsCorr; dsCorr = nullptr;
            delete dsPval; dsPval = nullptr;
            
            // Return comprehensive result list
            return Rcpp::List::create(
                Rcpp::Named("filename") = output_filename,
                Rcpp::Named("group") = stroutgroup,
                Rcpp::Named("correlation") = corr_dataset_name,
                Rcpp::Named("method") = method,
                Rcpp::Named("correlation_type") = "cross",
                Rcpp::Named("trans_x") = trans_x,
                Rcpp::Named("trans_y") = trans_y,
                Rcpp::Named("n_variables_x") = (int)n_cols_a,
                Rcpp::Named("n_variables_y") = (int)n_cols_b,
                Rcpp::Named("n_observations") = (int)n_rows_a,
                Rcpp::Named("pvalues") = compute_pvalues ? pval_dataset_name : "",
                Rcpp::Named("has_pvalues") = compute_pvalues
            );
            
        } catch(H5::FileIException& error) {
            checkClose_file(dsA, dsB, dsCorr, dsPval);
            Rcpp::Rcerr << "\nC++ exception RcppbdCorr_hdf5_cross (File IException): " << error.getDetailMsg() << std::endl;
            return R_NilValue;
        } catch(H5::DataSetIException& error) {
            checkClose_file(dsA, dsB, dsCorr, dsPval);
            Rcpp::Rcerr << "\nC++ exception RcppbdCorr_hdf5_cross (DataSet IException): " << error.getDetailMsg() << std::endl;
            return R_NilValue;
        } catch(std::exception &ex) {
            checkClose_file(dsA, dsB, dsCorr, dsPval);
            Rcpp::Rcerr << "C++ exception RcppbdCorr_hdf5_cross: " << ex.what() << std::endl;
            return R_NilValue;
        } catch (...) {
            checkClose_file(dsA, dsB, dsCorr, dsPval);
            Rcpp::Rcerr << "\nC++ exception RcppbdCorr_hdf5_cross (unknown reason)" << std::endl;
            return R_NilValue;
        }
    }

8 Usage Example

#include "BigDataStatMeth.hpp"

// Example usage
auto result = RcppbdCorr_hdf5_cross(...);