RcppbdCorr_hdf5_Block_single

C++ Function Reference

1 Signature

void BigDataStatMeth::RcppbdCorr_hdf5_Block_single(T *dsA, BigDataStatMeth::hdf5Dataset *dsCorr, BigDataStatMeth::hdf5Dataset *dsPval, const std::string &method="pearson", bool use_complete_obs=true, bool compute_pvalues=false, bool trans_x=false, int block_size=1000, Rcpp::Nullable< int > threads=R_NilValue)

2 Parameters

  • dsA (T *)
  • dsCorr (BigDataStatMeth::hdf5Dataset *)
  • dsPval (BigDataStatMeth::hdf5Dataset *)
  • method (const std::string &)
  • use_complete_obs (bool)
  • compute_pvalues (bool)
  • trans_x (bool)
  • block_size (int)
  • threads (Rcpp::Nullable< int >)

3 Returns

Type: class T

4 Call Graph

Function dependencies

5 Source Code

File: inst/include/hdf5Algebra/matrixCorrelation.hppLines 1233-1366

inline void RcppbdCorr_hdf5_Block_single(T* dsA, 
                                             BigDataStatMeth::hdf5Dataset* dsCorr,
                                             BigDataStatMeth::hdf5Dataset* dsPval,
                                             const std::string& method = "pearson",
                                             bool use_complete_obs = true,
                                             bool compute_pvalues = false,
                                             bool trans_x = false,
                                             int block_size = 1000,
                                             Rcpp::Nullable<int> threads = R_NilValue) {
        
        static_assert(std::is_same<T*, BigDataStatMeth::hdf5Dataset*>::value ||
                      std::is_same<T*, BigDataStatMeth::hdf5DatasetInternal*>::value,
                      "Error - type not allowed");
        
        try {
            std::vector<hsize_t> stride = {1, 1}, block = {1, 1};
            
            // CRITICAL FIX: R→HDF5 data is implicitly transposed
            // HDF5 dimensions are NOT the "real" data dimensions from R
            hsize_t n_rows_hdf5 = dsA->nrows();   // Variables in HDF5 (e.g., genes)
            hsize_t n_cols_hdf5 = dsA->ncols();   // Observations in HDF5 (e.g., samples)
            
            // Real dimensions (as they were in R before saving to HDF5):
            hsize_t n_obs_real = n_cols_hdf5;     // Real observations (samples)
            hsize_t n_vars_real = n_rows_hdf5;    // Real variables (genes)
            
            // Apply user transpose logic to REAL dimensions
            hsize_t n_rows = trans_x ? n_vars_real : n_obs_real;   // Effective observations after user transpose
            hsize_t n_cols = trans_x ? n_obs_real : n_vars_real;   // Effective variables after user transpose
            
            // Strategy: Read entire matrix if possible (most cases)
            const hsize_t MEMORY_LIMIT = 500000; // 500K elements ~ 4GB for double
            
            if (n_rows_hdf5 * n_cols_hdf5 < MEMORY_LIMIT) {
                
                // Load full matrix and use efficient in-memory correlation with transpose
                std::vector<double> matrix_data(n_rows_hdf5 * n_cols_hdf5);
                dsA->readDatasetBlock({0, 0}, {n_rows_hdf5, n_cols_hdf5}, stride, block, matrix_data.data());
                
                // CRITICAL FIX: Create Eigen matrix accounting for R→HDF5 transposition
                // Data in HDF5 is transposed compared to how it was in R
                Eigen::MatrixXd X = Eigen::Map<Eigen::Matrix<double, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>(
                    matrix_data.data(), n_rows_hdf5, n_cols_hdf5);
                
                corr_result result = RcppbdCorr_matrix_single(X, method, use_complete_obs, compute_pvalues, trans_x, threads);
                
                if (!result.bcomputed) {
                    checkClose_file(dsA, dsCorr, dsPval);
                    throw std::runtime_error("In-memory correlation computation failed");
                }
                
                // Write results
                dsCorr->createDataset(result.correlation_matrix.rows(), result.correlation_matrix.cols(), "real");
                dsCorr->writeDataset(Rcpp::wrap(result.correlation_matrix));
                
                // Write p-values matrix if computed (vectorized result)
                if (compute_pvalues && result.has_pvalues && dsPval && result.pvalues.rows() > 0 && result.pvalues.cols() > 0) {
                    dsPval->createDataset(result.pvalues.rows(), result.pvalues.cols(), "real");
                    dsPval->writeDataset(Rcpp::wrap(result.pvalues));
                }
                
            } else {
                
                Eigen::MatrixXd corr_matrix = Eigen::MatrixXd::Identity(n_cols, n_cols);
                
                
                
#ifdef _OPENMP
                int num_threads = std::min(2, omp_get_max_threads());
            #pragma omp parallel for num_threads(num_threads) schedule(dynamic, 1)
#endif
                for (hsize_t i = 0; i < n_cols; ++i) {
                    std::vector<double> vec_i_data(n_rows);
                    
                    // CRITICAL FIX: Transpose-aware I/O patterns for HDF5 data (R→HDF5 transposition)
                    if (trans_x) {
                        // User wants correlation between samples
                        // In HDF5: samples are columns, so read column i
                        dsA->readDatasetBlock({0, i}, {n_rows_hdf5, 1}, stride, block, vec_i_data.data());
                    } else {
                        // User wants correlation between variables (default)
                        // In HDF5: variables are rows, so read row i
                        dsA->readDatasetBlock({i, 0}, {1, n_cols_hdf5}, stride, block, vec_i_data.data());
                    }
                    
                    Eigen::VectorXd vec_i = Eigen::Map<Eigen::VectorXd>(vec_i_data.data(), n_rows);
                    
                    // Compute correlations for upper triangle (symmetric matrix)
                    for (hsize_t j = i + 1; j < n_cols; ++j) {
                        std::vector<double> vec_j_data(n_rows);
                        
                        // Consistent reading for second vector (final correction)
                        if (trans_x) {
                            // User wants correlation between samples
                            // Read column j (this currently gives correct cor(ds) result)
                            dsA->readDatasetBlock({0, j}, {n_rows_hdf5, 1}, stride, block, vec_j_data.data());
                        } else {
                            // User wants correlation between variables (default - same as cor(ds))
                            // Read row j (swap to make this give cor(ds) result)
                            dsA->readDatasetBlock({j, 0}, {1, n_cols_hdf5}, stride, block, vec_j_data.data());
                        }
                        
                        Eigen::VectorXd vec_j = Eigen::Map<Eigen::VectorXd>(vec_j_data.data(), n_rows);
                        
                        double corr_val;
                        if (method == "spearman") {
                            corr_val = spearman_correlation(vec_i, vec_j, use_complete_obs);
                        } else {
                            corr_val = pearson_correlation(vec_i, vec_j, use_complete_obs);
                        }
                        
                        corr_matrix(i, j) = corr_val;
                        corr_matrix(j, i) = corr_val;
                        
                    }
                }
                
                dsCorr->createDataset(corr_matrix.rows(), corr_matrix.cols(), "real");
                dsCorr->writeDataset(Rcpp::wrap(corr_matrix));
                
                if (compute_pvalues && dsPval) {
                    Eigen::MatrixXd pvalues_matrix = compute_pvalues_optimized(corr_matrix, n_rows, true);
                    dsPval->createDataset(pvalues_matrix.rows(), pvalues_matrix.cols(), "real");
                    dsPval->writeDataset(Rcpp::wrap(pvalues_matrix));
                }
            }
            
            
        } catch(std::exception &ex) {
            checkClose_file(dsA, dsCorr, dsPval);
            Rcpp::Rcerr << "C++ exception RcppbdCorr_hdf5_Block_single: " << ex.what() << std::endl;
            throw;
        }
    }

6 Usage Example

#include "BigDataStatMeth.hpp"

// Example usage
auto result = RcppbdCorr_hdf5_Block_single(...);