Rcpp_Impute_snps_hdf5

C++ Function Reference

1 Signature

void BigDataStatMeth::Rcpp_Impute_snps_hdf5(BigDataStatMeth::hdf5Dataset *dsIn, BigDataStatMeth::hdf5DatasetInternal *dsOut, bool bycols, std::string stroutdataset, Rcpp::Nullable< int > threads=R_NilValue)

2 Description

Imputes missing values in an HDF5 dataset.

3 Parameters

  • dsIn (BigDataStatMeth::hdf5Dataset *): Input HDF5 dataset
  • dsOut (BigDataStatMeth::hdf5DatasetInternal *): Output HDF5 dataset for imputed data
  • bycols (bool): If true, process by columns; if false, process by rows
  • stroutdataset (std::string): Name for the output dataset
  • threads (Rcpp::Nullable< int >): Optional number of threads for parallel processing

4 Details

This function performs imputation of missing values (represented by 3) in an HDF5 dataset. It can operate on either rows or columns and supports parallel processing for improved performance.

5 Call Graph

Function dependencies

6 Source Code

File: inst/include/hdf5Utilities/hdf5ImputeData.hppLines 176-314

inline void Rcpp_Impute_snps_hdf5(BigDataStatMeth::hdf5Dataset* dsIn, BigDataStatMeth::hdf5DatasetInternal* dsOut,
                         bool bycols, std::string stroutdataset, Rcpp::Nullable<int> threads  = R_NilValue)
    {
        
        try{
            
            std::vector<hsize_t> stride = {1,1},
                                 block = {1,1};
            
            int ilimit,
                blocksize = 1000;
            
            hsize_t* dims_out = dsIn->dim();
            
            // id bycols == true : read all rows by group of columns ; else : all columns by group of rows
            if (bycols == true) {
                ilimit = dims_out[0];
            } else {
                ilimit = dims_out[1];
            };
            
            
            if( stroutdataset.compare("")!=0) {
                dsOut->createDataset(dims_out[0], dims_out[1], "real");
            } 
            
            dsOut->openDataset();
            
            int chunks = (ilimit + (blocksize - 1)) / blocksize; //(ilimit/blocksize);

            #pragma omp parallel num_threads(get_number_threads(threads, R_NilValue)) shared(dsIn, dsOut, chunks)
            {
                #pragma omp for schedule(auto)
                for( int i=0; i < chunks; i++) 
                {
                    
                    std::vector<hsize_t> offset = {0,0},
                                         count = {0,0};
                    
                    int iread;
                    
                    if( (i+1)*blocksize < ilimit) iread = blocksize;
                    else iread = ilimit - (i*blocksize);
                    
                    // id bycols == true : read all rows by group of columns ; else : all columns by group of rows
                    if(bycols == true) {
                        count[0] = iread; 
                        count[1] = dims_out[1];
                        offset[0] = i*blocksize;
                    } else {
                        count[0] = dims_out[0];
                        count[1] = iread; 
                        offset[1] = i*blocksize;
                    }
                    
                    // read block
                    std::vector<double> vdIn( count[0] * count[1] ); 
                    #pragma omp critical(accessFile)
                    {
                        dsIn->readDatasetBlock( { offset[0], offset[1]}, { count[0], count[1]}, stride, block, vdIn.data() );
                    }
                    Eigen::MatrixXd data = Eigen::Map<Eigen::Matrix<double, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>> (vdIn.data(), count[0], count[1] );
                    
                    if(bycols == true) // We have to do it by rows
                    {
                        for( int row = 0; row<data.rows(); row++)  // COMPLETE EXECUTION
                        {
                            std::map<double, double> myMap;
                            myMap = VectortoOrderedMap_SNP_counts(data.row(row));
                            
                            Eigen::VectorXd ev = data.row(row);
                            std::vector<double> v(ev.data(), ev.data() + ev.size());
                            
                            auto it = std::find_if(std::begin(v), std::end(v), [](int i){return i == 3;});
                            while (it != std::end(v)) {
                                
                                if(*it==3) *it = get_value_to_impute_discrete(myMap);
                                it = std::find_if(std::next(it), std::end(v), [](int i){return i == 3;});
                            }
                            
                            Eigen::VectorXd X = Eigen::Map<Eigen::VectorXd>(v.data(), v.size());
                            data.row(row) = X;
                            
                        }
                        
                    } else {
                        for( int col = 0; col<data.cols(); col++) 
                        {
                            std::map<double, double> myMap;
                            myMap = VectortoOrderedMap_SNP_counts(data.col(col));
                            
                            Eigen::VectorXd ev = data.col(col);
                            std::vector<double> v(ev.data(), ev.data() + ev.size());
                            
                            auto it = std::find_if(std::begin(v), std::end(v), [](int i){return i == 3;});
                            while (it != std::end(v)) {
                                if(*it==3) *it = get_value_to_impute_discrete(myMap);
                                it = std::find_if(std::next(it), std::end(v), [](int i){return i == 3;});
                            }
                            
                            Eigen::VectorXd X = Eigen::Map<Eigen::VectorXd>(v.data(), v.size());
                            data.col(col) = X;
                            
                        }
                    }
                    
                    #pragma omp critical(accessFile)
                    {
                        dsOut->writeDatasetBlock( Rcpp::wrap(data), offset, count, stride, block, false);
                    }
                }
                
            }
            
        } catch( H5::FileIException& error) { // catch failure caused by the H5File operations
            checkClose_file(dsIn, dsOut);
            Rf_error("c++ exception Rcpp_Impute_snps_hdf5 (File IException)");
        } catch( H5::DataSetIException& error) { // catch failure caused by the DataSet operations
            checkClose_file(dsIn, dsOut);
            Rf_error("c++ exception Rcpp_Impute_snps_hdf5 (DataSet IException)");
        } catch( H5::GroupIException& error) { // catch failure caused by the Group operations
            checkClose_file(dsIn, dsOut);
            Rf_error("c++ exception Rcpp_Impute_snps_hdf5 (Group IException)");
        } catch( H5::DataSpaceIException& error) { // catch failure caused by the DataSpace operations
            checkClose_file(dsIn, dsOut);
            Rf_error("c++ exception Rcpp_Impute_snps_hdf5 (DataSpace IException)");
        } catch( H5::DataTypeIException& error) { // catch failure caused by the DataSpace operations
            checkClose_file(dsIn, dsOut);
            Rf_error("c++ exception Rcpp_Impute_snps_hdf5 (Data TypeIException)");
        } catch(std::exception &ex) {
            checkClose_file(dsIn, dsOut);
            Rf_error( "c++ exception Rcpp_Impute_snps_hdf5 : %s", ex.what());
        } catch (...) {
            checkClose_file(dsIn, dsOut);
            Rf_error("C++ exception Rcpp_Impute_snps_hdf5 (unknown reason)");
        }
        
        return void();
    }

7 Usage Example

#include "BigDataStatMeth.hpp"

// Example usage
auto result = Rcpp_Impute_snps_hdf5(...);