Rcpp_Impute_snps_hdf5
C++ Function Reference
1 Signature
void BigDataStatMeth::Rcpp_Impute_snps_hdf5(BigDataStatMeth::hdf5Dataset *dsIn, BigDataStatMeth::hdf5DatasetInternal *dsOut, bool bycols, std::string stroutdataset, Rcpp::Nullable< int > threads=R_NilValue)2 Description
Imputes missing values in an HDF5 dataset.
3 Parameters
dsIn(BigDataStatMeth::hdf5Dataset *): Input HDF5 datasetdsOut(BigDataStatMeth::hdf5DatasetInternal *): Output HDF5 dataset for imputed databycols(bool): If true, process by columns; if false, process by rowsstroutdataset(std::string): Name for the output datasetthreads(Rcpp::Nullable< int >): Optional number of threads for parallel processing
4 Details
This function performs imputation of missing values (represented by 3) in an HDF5 dataset. It can operate on either rows or columns and supports parallel processing for improved performance.
5 Call Graph
6 Source Code
NoteImplementation
File: inst/include/hdf5Utilities/hdf5ImputeData.hpp • Lines 176-314
inline void Rcpp_Impute_snps_hdf5(BigDataStatMeth::hdf5Dataset* dsIn, BigDataStatMeth::hdf5DatasetInternal* dsOut,
bool bycols, std::string stroutdataset, Rcpp::Nullable<int> threads = R_NilValue)
{
try{
std::vector<hsize_t> stride = {1,1},
block = {1,1};
int ilimit,
blocksize = 1000;
hsize_t* dims_out = dsIn->dim();
// id bycols == true : read all rows by group of columns ; else : all columns by group of rows
if (bycols == true) {
ilimit = dims_out[0];
} else {
ilimit = dims_out[1];
};
if( stroutdataset.compare("")!=0) {
dsOut->createDataset(dims_out[0], dims_out[1], "real");
}
dsOut->openDataset();
int chunks = (ilimit + (blocksize - 1)) / blocksize; //(ilimit/blocksize);
#pragma omp parallel num_threads(get_number_threads(threads, R_NilValue)) shared(dsIn, dsOut, chunks)
{
#pragma omp for schedule(auto)
for( int i=0; i < chunks; i++)
{
std::vector<hsize_t> offset = {0,0},
count = {0,0};
int iread;
if( (i+1)*blocksize < ilimit) iread = blocksize;
else iread = ilimit - (i*blocksize);
// id bycols == true : read all rows by group of columns ; else : all columns by group of rows
if(bycols == true) {
count[0] = iread;
count[1] = dims_out[1];
offset[0] = i*blocksize;
} else {
count[0] = dims_out[0];
count[1] = iread;
offset[1] = i*blocksize;
}
// read block
std::vector<double> vdIn( count[0] * count[1] );
#pragma omp critical(accessFile)
{
dsIn->readDatasetBlock( { offset[0], offset[1]}, { count[0], count[1]}, stride, block, vdIn.data() );
}
Eigen::MatrixXd data = Eigen::Map<Eigen::Matrix<double, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>> (vdIn.data(), count[0], count[1] );
if(bycols == true) // We have to do it by rows
{
for( int row = 0; row<data.rows(); row++) // COMPLETE EXECUTION
{
std::map<double, double> myMap;
myMap = VectortoOrderedMap_SNP_counts(data.row(row));
Eigen::VectorXd ev = data.row(row);
std::vector<double> v(ev.data(), ev.data() + ev.size());
auto it = std::find_if(std::begin(v), std::end(v), [](int i){return i == 3;});
while (it != std::end(v)) {
if(*it==3) *it = get_value_to_impute_discrete(myMap);
it = std::find_if(std::next(it), std::end(v), [](int i){return i == 3;});
}
Eigen::VectorXd X = Eigen::Map<Eigen::VectorXd>(v.data(), v.size());
data.row(row) = X;
}
} else {
for( int col = 0; col<data.cols(); col++)
{
std::map<double, double> myMap;
myMap = VectortoOrderedMap_SNP_counts(data.col(col));
Eigen::VectorXd ev = data.col(col);
std::vector<double> v(ev.data(), ev.data() + ev.size());
auto it = std::find_if(std::begin(v), std::end(v), [](int i){return i == 3;});
while (it != std::end(v)) {
if(*it==3) *it = get_value_to_impute_discrete(myMap);
it = std::find_if(std::next(it), std::end(v), [](int i){return i == 3;});
}
Eigen::VectorXd X = Eigen::Map<Eigen::VectorXd>(v.data(), v.size());
data.col(col) = X;
}
}
#pragma omp critical(accessFile)
{
dsOut->writeDatasetBlock( Rcpp::wrap(data), offset, count, stride, block, false);
}
}
}
} catch( H5::FileIException& error) { // catch failure caused by the H5File operations
checkClose_file(dsIn, dsOut);
Rf_error("c++ exception Rcpp_Impute_snps_hdf5 (File IException)");
} catch( H5::DataSetIException& error) { // catch failure caused by the DataSet operations
checkClose_file(dsIn, dsOut);
Rf_error("c++ exception Rcpp_Impute_snps_hdf5 (DataSet IException)");
} catch( H5::GroupIException& error) { // catch failure caused by the Group operations
checkClose_file(dsIn, dsOut);
Rf_error("c++ exception Rcpp_Impute_snps_hdf5 (Group IException)");
} catch( H5::DataSpaceIException& error) { // catch failure caused by the DataSpace operations
checkClose_file(dsIn, dsOut);
Rf_error("c++ exception Rcpp_Impute_snps_hdf5 (DataSpace IException)");
} catch( H5::DataTypeIException& error) { // catch failure caused by the DataSpace operations
checkClose_file(dsIn, dsOut);
Rf_error("c++ exception Rcpp_Impute_snps_hdf5 (Data TypeIException)");
} catch(std::exception &ex) {
checkClose_file(dsIn, dsOut);
Rf_error( "c++ exception Rcpp_Impute_snps_hdf5 : %s", ex.what());
} catch (...) {
checkClose_file(dsIn, dsOut);
Rf_error("C++ exception Rcpp_Impute_snps_hdf5 (unknown reason)");
}
return void();
}7 Usage Example
#include "BigDataStatMeth.hpp"
// Example usage
auto result = Rcpp_Impute_snps_hdf5(...);