Rcpp_Remove_Low_Data_hdf5

C++ Function Reference

1 Signature

int BigDataStatMeth::Rcpp_Remove_Low_Data_hdf5(BigDataStatMeth::hdf5Dataset *dsIn, BigDataStatMeth::hdf5Dataset *dsOut, bool bycols, double pcent)

2 Description

Removes rows or columns with high percentage of missing data from HDF5 dataset.

3 Parameters

  • dsIn (BigDataStatMeth::hdf5Dataset *): Pointer to input HDF5 dataset
  • dsOut (BigDataStatMeth::hdf5Dataset *): Pointer to output HDF5 dataset where filtered data will be stored
  • bycols (bool): If true, process by columns; if false, process by rows
  • pcent (double): Threshold percentage (0.0-1.0) of missing data to trigger removal

4 Returns

int Number of rows/columns removed (negative) or error code

5 Details

This function processes an HDF5 dataset in blocks, removing rows or columns where the percentage of missing data (represented by value 3) exceeds the specified threshold. The filtered data is written to a new dataset.

6 Call Graph

Function dependencies

7 Source Code

File: inst/include/hdf5Utilities/hdf5RemoveLowData.hppLines 72-220

inline int Rcpp_Remove_Low_Data_hdf5( BigDataStatMeth::hdf5Dataset* dsIn, BigDataStatMeth::hdf5Dataset* dsOut, bool bycols, double pcent)
    {
        
        int itotrem = 0;
        
        try{
        
            int ilimit,
                blocksize = 1000;
            
            bool bcreated = false;
            
            std::vector<hsize_t> offset = {0,0},
                                 count = {0,0},
                                 stride = {1,1},
                                 block = {1,1},
                                 newoffset = {0,0};
            
            hsize_t* dims_out = dsIn->dim();
            
            // id bycols == true : read all rows by group of columns ; else : all columns by group of rows
            if (bycols == true) {
                ilimit = dims_out[0];
                count[1] = dims_out[1];
                offset[1] = 0;
            } else {
                ilimit = dims_out[1];
                count[0] = dims_out[0];
                offset[0] = 0;
            };
            
            for( int i=0; i<=(ilimit/blocksize); i++) 
            {
                int iread;
                int iblockrem = 0;
                
                if( (i+1)*blocksize < ilimit) iread = blocksize;
                else iread = ilimit - (i*blocksize);
                
                if(bycols == true) {
                    count[0] = iread; 
                    offset[0] = i*blocksize;
                } else {
                    count[1] = iread; 
                    offset[1] = i*blocksize;
                }
                
                // read block
                std::vector<double> vdCurDataset( count[0] * count[1] ); 
                dsIn->readDatasetBlock( {offset[0], offset[1]}, {count[0], count[1]}, stride, block, vdCurDataset.data() );
                Eigen::MatrixXd data = Eigen::Map<Eigen::Matrix<double, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>> (vdCurDataset.data(), count[0], count[1] );
                
                
                if(bycols == true) // We have to do it by rows
                {
                    int readedrows = data.rows();
                    
                    for( int row = readedrows-1 ; row>=0; row--)  // COMPLETE EXECUTION
                    {
                        if((data.row(row).array() == 3).count()/(double)count[1]>= pcent )
                        {
                            removeRow(data, row);
                            iblockrem = iblockrem + 1;
                        } 
                    }
                    
                } else {
                    
                    int readedcols = data.cols();
                    
                    for( int col = readedcols-1 ; col>=0; col--)  // COMPLETE EXECUTION
                    { 
                        if((data.col(col).array() == 3).count()/(double)count[0]>=pcent )
                        {
                            removeColumn(data, col);
                            iblockrem = iblockrem + 1;
                        } 
                    }
                }
                
                int extendcols = data.cols();
                int extendrows = data.rows();
                
                if( bcreated == false) {
                    
                    if( extendcols > 0 && extendrows > 0){
                        dsOut->createUnlimitedDataset(extendrows, extendcols, "real");
                        dsOut->openDataset();
                        bcreated = true;
                    } 
                    
                } else {
                    if(bycols == true){
                        dsOut->extendUnlimitedDataset(extendrows, 0);
                    }else{
                        dsOut->extendUnlimitedDataset( 0, extendcols);
                    }
                }

                if( bcreated == true) {
                    std::vector<hsize_t> countblock = {(unsigned long long)extendrows, (unsigned long long)extendcols};
                    dsOut->writeDatasetBlock( Rcpp::wrap(data), newoffset, countblock, stride, block, false);
                    
                    if(bycols == true)
                        newoffset[0] =  newoffset[0] + extendrows;
                    else
                        newoffset[1] =  newoffset[1] + extendcols;
                }
                
                itotrem = itotrem - iblockrem;
                
            }
            
            if( bcreated == false ) {
                Rcpp::warning("All data removed - please adjust pcent parameter or review data");
            }
            
        } catch( H5::FileIException& error) { // catch failure caused by the H5File operations
            checkClose_file(dsIn, dsOut);
            Rcpp::Rcerr<<"c++ exception Rcpp_Remove_Low_Data_hdf5 (File IException)" << std::endl;
            return -1;
        } catch( H5::DataSetIException& error) { // catch failure caused by the DataSet operations
            checkClose_file(dsIn, dsOut);
            Rcpp::Rcerr<<"c++ exception Rcpp_Remove_Low_Data_hdf5 (DataSet IException)" << std::endl;
            return -1;
        } catch( H5::GroupIException& error) { // catch failure caused by the Group operations
            checkClose_file(dsIn, dsOut);
            Rcpp::Rcerr<<"c++ exception Rcpp_Remove_Low_Data_hdf5 (Group IException)" << std::endl;
            return -1;
        } catch( H5::DataSpaceIException& error) { // catch failure caused by the DataSpace operations
            checkClose_file(dsIn, dsOut);
            Rcpp::Rcerr<<"c++ exception Rcpp_Remove_Low_Data_hdf5 (DataSpace IException)" << std::endl;
            return -1;
        } catch( H5::DataTypeIException& error) { // catch failure caused by the DataSpace operations
            checkClose_file(dsIn, dsOut);
            Rcpp::Rcerr<<"c++ exception Rcpp_Remove_Low_Data_hdf5 (Data TypeIException)" << std::endl;
            return -1;
        } catch(std::exception &ex) {
            checkClose_file(dsIn, dsOut);
            Rcpp::Rcerr << "c++ exception Rcpp_Remove_Low_Data_hdf5: " << ex.what();
            return -1;
        } catch (...) {
            checkClose_file(dsIn, dsOut);
            Rcpp::Rcerr<<"C++ exception Rcpp_Remove_Low_Data_hdf5 (unknown reason)";
            return -1;
        }
        
        return(itotrem);
    }

8 Usage Example

#include "BigDataStatMeth.hpp"

// Example usage
auto result = Rcpp_Remove_Low_Data_hdf5(...);