RcppbdCorr_hdf5_Block_single
C++ Function Reference
1 Signature
void BigDataStatMeth::RcppbdCorr_hdf5_Block_single(T *dsA, BigDataStatMeth::hdf5Dataset *dsCorr, BigDataStatMeth::hdf5Dataset *dsPval, const std::string &method="pearson", bool use_complete_obs=true, bool compute_pvalues=false, bool trans_x=false, int block_size=1000, Rcpp::Nullable< int > threads=R_NilValue)2 Parameters
dsA(T *)dsCorr(BigDataStatMeth::hdf5Dataset *)dsPval(BigDataStatMeth::hdf5Dataset *)method(const std::string &)use_complete_obs(bool)compute_pvalues(bool)trans_x(bool)block_size(int)threads(Rcpp::Nullable< int >)
3 Returns
Type: class T
4 Call Graph
5 Source Code
NoteImplementation
File: inst/include/hdf5Algebra/matrixCorrelation.hpp • Lines 1233-1366
inline void RcppbdCorr_hdf5_Block_single(T* dsA,
BigDataStatMeth::hdf5Dataset* dsCorr,
BigDataStatMeth::hdf5Dataset* dsPval,
const std::string& method = "pearson",
bool use_complete_obs = true,
bool compute_pvalues = false,
bool trans_x = false,
int block_size = 1000,
Rcpp::Nullable<int> threads = R_NilValue) {
static_assert(std::is_same<T*, BigDataStatMeth::hdf5Dataset*>::value ||
std::is_same<T*, BigDataStatMeth::hdf5DatasetInternal*>::value,
"Error - type not allowed");
try {
std::vector<hsize_t> stride = {1, 1}, block = {1, 1};
// CRITICAL FIX: R→HDF5 data is implicitly transposed
// HDF5 dimensions are NOT the "real" data dimensions from R
hsize_t n_rows_hdf5 = dsA->nrows(); // Variables in HDF5 (e.g., genes)
hsize_t n_cols_hdf5 = dsA->ncols(); // Observations in HDF5 (e.g., samples)
// Real dimensions (as they were in R before saving to HDF5):
hsize_t n_obs_real = n_cols_hdf5; // Real observations (samples)
hsize_t n_vars_real = n_rows_hdf5; // Real variables (genes)
// Apply user transpose logic to REAL dimensions
hsize_t n_rows = trans_x ? n_vars_real : n_obs_real; // Effective observations after user transpose
hsize_t n_cols = trans_x ? n_obs_real : n_vars_real; // Effective variables after user transpose
// Strategy: Read entire matrix if possible (most cases)
const hsize_t MEMORY_LIMIT = 500000; // 500K elements ~ 4GB for double
if (n_rows_hdf5 * n_cols_hdf5 < MEMORY_LIMIT) {
// Load full matrix and use efficient in-memory correlation with transpose
std::vector<double> matrix_data(n_rows_hdf5 * n_cols_hdf5);
dsA->readDatasetBlock({0, 0}, {n_rows_hdf5, n_cols_hdf5}, stride, block, matrix_data.data());
// CRITICAL FIX: Create Eigen matrix accounting for R→HDF5 transposition
// Data in HDF5 is transposed compared to how it was in R
Eigen::MatrixXd X = Eigen::Map<Eigen::Matrix<double, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>(
matrix_data.data(), n_rows_hdf5, n_cols_hdf5);
corr_result result = RcppbdCorr_matrix_single(X, method, use_complete_obs, compute_pvalues, trans_x, threads);
if (!result.bcomputed) {
checkClose_file(dsA, dsCorr, dsPval);
throw std::runtime_error("In-memory correlation computation failed");
}
// Write results
dsCorr->createDataset(result.correlation_matrix.rows(), result.correlation_matrix.cols(), "real");
dsCorr->writeDataset(Rcpp::wrap(result.correlation_matrix));
// Write p-values matrix if computed (vectorized result)
if (compute_pvalues && result.has_pvalues && dsPval && result.pvalues.rows() > 0 && result.pvalues.cols() > 0) {
dsPval->createDataset(result.pvalues.rows(), result.pvalues.cols(), "real");
dsPval->writeDataset(Rcpp::wrap(result.pvalues));
}
} else {
Eigen::MatrixXd corr_matrix = Eigen::MatrixXd::Identity(n_cols, n_cols);
#ifdef _OPENMP
int num_threads = std::min(2, omp_get_max_threads());
#pragma omp parallel for num_threads(num_threads) schedule(dynamic, 1)
#endif
for (hsize_t i = 0; i < n_cols; ++i) {
std::vector<double> vec_i_data(n_rows);
// CRITICAL FIX: Transpose-aware I/O patterns for HDF5 data (R→HDF5 transposition)
if (trans_x) {
// User wants correlation between samples
// In HDF5: samples are columns, so read column i
dsA->readDatasetBlock({0, i}, {n_rows_hdf5, 1}, stride, block, vec_i_data.data());
} else {
// User wants correlation between variables (default)
// In HDF5: variables are rows, so read row i
dsA->readDatasetBlock({i, 0}, {1, n_cols_hdf5}, stride, block, vec_i_data.data());
}
Eigen::VectorXd vec_i = Eigen::Map<Eigen::VectorXd>(vec_i_data.data(), n_rows);
// Compute correlations for upper triangle (symmetric matrix)
for (hsize_t j = i + 1; j < n_cols; ++j) {
std::vector<double> vec_j_data(n_rows);
// Consistent reading for second vector (final correction)
if (trans_x) {
// User wants correlation between samples
// Read column j (this currently gives correct cor(ds) result)
dsA->readDatasetBlock({0, j}, {n_rows_hdf5, 1}, stride, block, vec_j_data.data());
} else {
// User wants correlation between variables (default - same as cor(ds))
// Read row j (swap to make this give cor(ds) result)
dsA->readDatasetBlock({j, 0}, {1, n_cols_hdf5}, stride, block, vec_j_data.data());
}
Eigen::VectorXd vec_j = Eigen::Map<Eigen::VectorXd>(vec_j_data.data(), n_rows);
double corr_val;
if (method == "spearman") {
corr_val = spearman_correlation(vec_i, vec_j, use_complete_obs);
} else {
corr_val = pearson_correlation(vec_i, vec_j, use_complete_obs);
}
corr_matrix(i, j) = corr_val;
corr_matrix(j, i) = corr_val;
}
}
dsCorr->createDataset(corr_matrix.rows(), corr_matrix.cols(), "real");
dsCorr->writeDataset(Rcpp::wrap(corr_matrix));
if (compute_pvalues && dsPval) {
Eigen::MatrixXd pvalues_matrix = compute_pvalues_optimized(corr_matrix, n_rows, true);
dsPval->createDataset(pvalues_matrix.rows(), pvalues_matrix.cols(), "real");
dsPval->writeDataset(Rcpp::wrap(pvalues_matrix));
}
}
} catch(std::exception &ex) {
checkClose_file(dsA, dsCorr, dsPval);
Rcpp::Rcerr << "C++ exception RcppbdCorr_hdf5_Block_single: " << ex.what() << std::endl;
throw;
}
}6 Usage Example
#include "BigDataStatMeth.hpp"
// Example usage
auto result = RcppbdCorr_hdf5_Block_single(...);