Rcpp_Remove_MAF_hdf5

C++ Function Reference

1 Signature

int BigDataStatMeth::Rcpp_Remove_MAF_hdf5(BigDataStatMeth::hdf5Dataset *dsIn, BigDataStatMeth::hdf5Dataset *dsOut, bool bycols, double pcent, int blocksize)

2 Description

Removes rows or columns from a dataset based on MAF threshold.

3 Parameters

  • dsIn (BigDataStatMeth::hdf5Dataset *): Input HDF5 dataset containing genomic data
  • dsOut (BigDataStatMeth::hdf5Dataset *): Output HDF5 dataset for filtered data
  • bycols (bool): If true, process by columns; if false, process by rows
  • pcent (double): MAF threshold percentage (0.0 to 1.0)
  • blocksize (int): Number of rows/columns to process in each block

4 Returns

int Number of removed rows/columns (negative) or error code

5 Details

dsInInput HDF5 dataset containing genomic data dsOutOutput HDF5 dataset for filtered data bycolsIf true, process by columns; if false, process by rows pcentMAF threshold percentage (0.0 to 1.0) blocksizeNumber of rows/columns to process in each block int Number of removed rows/columns (negative) or error code Implementation approach:Processes data in blocks to manage memory usageDynamically creates and extends output datasetSupports both row-wise and column-wise filteringPreserves data structure while removing low-MAF variants

6 Call Graph

Function dependencies

7 Source Code

File: inst/include/hdf5Omics/hdf5RemoveMAF.hppLines 83-238

inline int Rcpp_Remove_MAF_hdf5( BigDataStatMeth::hdf5Dataset* dsIn, 
                            BigDataStatMeth::hdf5Dataset* dsOut, 
                            bool bycols, double pcent, int blocksize)
    {
        
        int itotrem = 0;
        
        try{
        
            bool bcreated = false;    
            int ilimit;
            
            std::vector<hsize_t> offset = {0,0},
                                 count = {0,0},
                                 stride = {1,1},
                                 block = {1,1},
                                 newoffset = {0,0};
            
            // Real data set dimension
            hsize_t* dims_out = dsIn->dim();
            
            // id bycols == true : read all rows by group of columns ; else : all columns by group of rows
            if (bycols == true) {
                ilimit = dims_out[0];
                count[1] = dims_out[1];
                offset[1] = 0;
            } else {
                ilimit = dims_out[1];
                count[0] = dims_out[0];
                offset[0] = 0;
            }
            
            for( int i=0; i <= (ilimit/blocksize); i++) 
            {
                int iread;
                int iblockrem = 0;
                
                if( (i+1)*blocksize < ilimit) iread = blocksize;
                else iread = ilimit - (i*blocksize);
                
                if(bycols == true) {
                    count[0] = iread; 
                    offset[0] = i*blocksize;
                } else {
                    count[1] = iread; 
                    offset[1] = i*blocksize;
                }
                
                // read block
                std::vector<double> vdCurDataset( count[0] * count[1] ); 
                dsIn->readDatasetBlock( {offset[0], offset[1]}, {count[0], count[1]}, stride, block, vdCurDataset.data() );
                Eigen::MatrixXd data = Eigen::Map<Eigen::Matrix<double, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>> (vdCurDataset.data(), count[0], count[1] );
                
                if(bycols == true) // We have to do it by rows
                {
                    int readedrows = data.rows();
                    for( int row = readedrows-1 ; row>=0; row--)
                    {
                        if( calc_freq(Rcpp::wrap(data.row(row))) <= pcent ) {
                            removeRow(data, row);
                            iblockrem = iblockrem + 1;
                        } 
                    }
                    
                } else {
                    
                    int readedcols = data.cols();
                    for( int col = readedcols-1 ; col>=0; col--)
                    { 
                        if( calc_freq(Rcpp::wrap(data.col(col))) <= pcent ) {
                            removeColumn(data, col);
                            iblockrem = iblockrem + 1;
                        } 
                    }
                }
                
                int extendcols = data.cols();
                int extendrows = data.rows();
                
                if( extendrows>0 && extendcols>0)
                {
                    if(bcreated == false) {
                        // create_HDF5_unlimited_matrix_dataset_ptr(file, stroutdata, extendrows, extendcols, "numeric");
                        // unlimDataset = new DataSet(file->openDataSet(stroutdata));
                        // bcreated = true;
                        
                        dsOut->createUnlimitedDataset(extendrows, extendcols, "real");
                        dsOut->openDataset();
                        bcreated = true;
                        
                    } else {
                        if(bycols == true){
                            // extend_HDF5_matrix_subset_ptr(file, unlimDataset, extendrows, 0);
                            dsOut->extendUnlimitedDataset(extendrows, 0);
                        }else{
                            // extend_HDF5_matrix_subset_ptr(file, unlimDataset, 0, extendcols);
                            dsOut->extendUnlimitedDataset( 0, extendcols);
                        }
                    }
                    
                    if( bcreated == true) {
                        
                        std::vector<hsize_t> countblock = {(unsigned long long)extendrows, (unsigned long long)extendcols};
                        dsOut->writeDatasetBlock( Rcpp::wrap(data), newoffset, countblock, stride, block, false);
                        
                        if(bycols == true)
                            newoffset[0] =  newoffset[0] + extendrows;
                        else
                            newoffset[1] =  newoffset[1] + extendcols;
                    }
                    
                    // IntegerVector countblock = IntegerVector::create(extendrows, extendcols);
                    // write_HDF5_matrix_subset_v2(file, unlimDataset, newoffset, countblock, stride, block, wrap(data) );
                    
                    // if(bycols == true)
                    //     newoffset[0] =  newoffset[0] + extendrows;
                    // else
                    //     newoffset[1] =  newoffset[1] + extendcols;
                    
                    itotrem = itotrem - iblockrem;
                }
            }
            
            // if (bcreated == true) {
            //     unlimDataset->close();
            // }
            
        } catch( H5::FileIException& error ){
            checkClose_file(dsIn, dsOut);
            Rcpp::Rcerr<<"\nc++ c++ exception Rcpp_Remove_MAF_hdf5 (File IException)\n";
            return -1;
        } catch( H5::DataSetIException& error ) { 
            checkClose_file(dsIn, dsOut);
            Rcpp::Rcerr<<"\nc++ c++ exception Rcpp_Remove_MAF_hdf5 (DataSet IException)\n";
            return -1;
        } catch( H5::DataSpaceIException& error ) { 
            checkClose_file(dsIn, dsOut);
            Rcpp::Rcerr<<"\nc++ c++ exception Rcpp_Remove_MAF_hdf5 (DataSpace IException)\n";
            return -1;
        } catch( H5::DataTypeIException& error ) { 
            checkClose_file(dsIn, dsOut);
            Rcpp::Rcerr<<"\nc++ c++ exception Rcpp_Remove_MAF_hdf5 (DataType IException)\n";
            return -1;
        } catch(std::exception &ex) {
            checkClose_file(dsIn, dsOut);
            Rcpp::Rcerr<<"\nc++ c++ exception Rcpp_Remove_MAF_hdf5: "<< ex.what()<<"\n";
            return -1;
        }  catch (...) {
            checkClose_file(dsIn, dsOut);
            Rcpp::Rcerr<<"\nC++ exception Rcpp_Remove_MAF_hdf5 (unknown reason)";
            return -1;
        }
        
        return(itotrem);

    }

8 Usage Example

#include "BigDataStatMeth.hpp"

// Example usage
auto result = Rcpp_Remove_MAF_hdf5(...);