Rcpp_Import_File_to_hdf5

C++ Function Reference

1 Signature

void BigDataStatMeth::Rcpp_Import_File_to_hdf5(Rcpp::CharacterVector filename, BigDataStatMeth::hdf5Dataset *dsOut, Rcpp::Nullable< std::string > sep=R_NilValue, Rcpp::Nullable< bool > header=false, Rcpp::Nullable< bool > rownames=false, Rcpp::Nullable< bool > bparal=R_NilValue, Rcpp::Nullable< int > threads=R_NilValue)

2 Description

Imports data from a file into an HDF5 dataset.

3 Parameters

  • filename (Rcpp::CharacterVector): Path to the input file
  • dsOut (BigDataStatMeth::hdf5Dataset *): Pointer to the HDF5 dataset where data will be stored
  • sep (Rcpp::Nullable< std::string >): Optional separator character (defaults to tab)
  • header (Rcpp::Nullable< bool >): Optional flag indicating presence of header row (defaults to false)
  • rownames (Rcpp::Nullable< bool >): Optional flag indicating presence of row names (defaults to false)
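  • bparal (Rcpp::Nullable< bool >): Optional flag for parallel processing (accepted for interface compatibility; the implementation listed in section 6 does not read it)
  • threads (Rcpp::Nullable< int >): Optional number of threads for parallel processing (accepted for interface compatibility; the implementation listed in section 6 does not read it)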

4 Details

Parameters:

  • filename: Path to the input file
  • dsOut: Pointer to the HDF5 dataset where data will be stored
  • sep: Optional separator character (defaults to tab)
  • header: Optional flag indicating presence of header row (defaults to false)
  • rownames: Optional flag indicating presence of row names (defaults to false)
  • bparal: Optional flag for parallel processing
  • threads: Optional number of threads for parallel processing

Notes:

  • Performance optimized through block-wise reading and processing
  • Automatically adjusts block size based on number of columns
  • Supports parallel processing for improved performance on large datasets

See also: hdf5Dataset for the dataset structure

5 Call Graph

Function dependencies

6 Source Code

File: inst/include/hdf5Utilities/hdf5ImportFiles.hpp, lines 173-403

/**
 * @brief Import a delimited text file into an HDF5 dataset, block by block.
 *
 * The first line (and, when row names are present, the second line) is used
 * to work out the number of data columns. The file is then re-read from the
 * start, rows are accumulated in blocks and each block is converted to
 * doubles and written to @p dsOut. Row/column names are stored afterwards
 * when @p header or @p rownames is set.
 *
 * @param filename Path to the input text file.
 * @param dsOut    Target dataset; createDataset(), writeDatasetBlock() and
 *                 the dimnames writer are invoked on it.
 * @param sep      Field separator(s). Every character of the string is
 *                 treated as a separator; defaults to tab when null.
 * @param header   True when the first line holds column names
 *                 (null is treated as false).
 * @param rownames True when the first field of every data line is a row
 *                 name (null is treated as false).
 * @param bparal   Not read by this implementation; kept for interface
 *                 compatibility.
 * @param threads  Not read by this implementation; kept for interface
 *                 compatibility.
 *
 * @note All exceptions raised inside the body (including the validation
 *       errors raised with Rcpp::stop) are caught by the handlers at the end
 *       of the function, reported on Rcpp::Rcerr, and the function then
 *       returns normally.
 */
inline void Rcpp_Import_File_to_hdf5( Rcpp::CharacterVector filename,
                                   BigDataStatMeth::hdf5Dataset* dsOut,
                                   Rcpp::Nullable<std::string> sep = R_NilValue,
                                   Rcpp::Nullable<bool> header = false,
                                   Rcpp::Nullable<bool> rownames = false,
                                   Rcpp::Nullable<bool> bparal = R_NilValue,
                                   Rcpp::Nullable<int> threads = R_NilValue)
    {

        try {

            if( dsOut == nullptr ) {
                Rcpp::stop("Output dataset pointer is null");
            }

            const std::string path = Rcpp::as<std::string>(filename);

            // Resolve the optional arguments once; a null flag means "false"
            const std::string stdsep = sep.isNull() ? std::string("\t") : Rcpp::as<std::string>(sep);
            const bool bheader   = !header.isNull()   && Rcpp::as<bool>(header);
            const bool brownames = !rownames.isNull() && Rcpp::as<bool>(rownames);

            // A field is a maximal run of non-separator characters
            std::regex reg_expres("[^" + stdsep + "]+");

            const auto countFields = [&reg_expres](const std::string& text) -> std::ptrdiff_t {
                return std::distance(
                        std::sregex_iterator(text.begin(), text.end(), reg_expres),
                        std::sregex_iterator());
            };

            std::ifstream inFile(path.c_str());
            if( !inFile.is_open() ) {
                Rcpp::stop("Unable to open file: " + path);
            }

            // ---- Number of columns (taken from the first line) -------------
            std::string line;
            std::getline(inFile, line, '\n');

            const std::ptrdiff_t icols = countFields(line);
            if( icols <= 0 ) {
                Rcpp::stop("No columns found in the first line of file: " + path);
            }

            hsize_t incols = static_cast<hsize_t>(icols);

            if( brownames ) {
                // Depending on how the file was written, the first line may or
                // may not contain a field for the row-name column, so compare
                // it against the second line.
                std::getline(inFile, line, '\n');
                const std::ptrdiff_t icols2 = countFields(line);

                if( icols2 == icols ) {
                    incols = static_cast<hsize_t>(icols - 1); // first field is the row name
                } else if( icols == icols2 - 1 ) {
                    incols = static_cast<hsize_t>(icols);     // header has no row-name field
                } else {
                    Rcpp::stop("Number of columns and headers are different, please review data, note that fields without values are not allowed");
                }
            }

            if( incols == 0 ) {
                Rcpp::stop("No data columns found in file: " + path);
            }

            // Narrow files use larger blocks
            const std::size_t blockRows = (incols < 100) ? 10000 : 1000;

            // ---- Number of rows --------------------------------------------
            // Count from the start of the file so the result does not depend
            // on how many lines were consumed above while detecting columns.
            inFile.clear();
            inFile.seekg(0);

            int irows = static_cast<int>(std::count(std::istreambuf_iterator<char>(inFile),
                                                    std::istreambuf_iterator<char>(), '\n'));

            // +1 to take into account a last line without '\n'
            if( get_NewLineEnding(path.c_str()) == false ) {
                irows = irows + 1;
            }

            // The header line is not a data row
            if( bheader ) {
                irows = irows - 1;
            }

            // ---- Second pass: read the data --------------------------------
            inFile.clear();
            inFile.seekg(0);

            Rcpp::CharacterVector svrcolnames;
            Rcpp::CharacterVector svrownames(irows);

            if( bheader ) {
                line.clear();
                std::getline(inFile, line, '\n');
                svrcolnames = Rcpp::wrap( get_SplitData_in_vectorString(line, reg_expres));

                // Drop the leading header field only when it really belongs to
                // the row-name column, i.e. the header has one field more than
                // there are data columns.
                if( brownames && static_cast<hsize_t>(svrcolnames.size()) == incols + 1 ) {
                    svrcolnames.erase(0);
                }
            }

            dsOut->createDataset( static_cast<hsize_t>(irows), incols, "real");

            std::vector<std::string> strBlockValues;
            std::vector<std::string> strValues;
            std::vector<hsize_t> stride = {1,1},
                                 block = {1,1},
                                 count = { incols, static_cast<hsize_t>(irows)},
                                 offset = {0,0};

            // Converts the accumulated rows to doubles, writes them at the
            // current row offset and advances the offset.
            const auto flushBlock = [&]() {
                if( strBlockValues.empty() ) {
                    return;
                }

                const hsize_t nBlockRows = strBlockValues.size() / incols;
                count[1] = nBlockRows;

                std::vector<double> doubleVector = get_data_as_Matrix(strBlockValues);
                double *p = doubleVector.data();

                // Values are stored row after row, i.e. as the columns of an
                // (incols x nBlockRows) column-major matrix.
                Eigen::Map<Eigen::Matrix<double, Eigen::Dynamic, Eigen::Dynamic, Eigen::ColMajor>> resMat (p, incols, nBlockRows );
                dsOut-> writeDatasetBlock( Rcpp::wrap(resMat.transpose()), offset, count, stride, block, false);

                offset[1] = offset[1] + nBlockRows;
                strBlockValues.clear();
            };

            const std::size_t expectedFields = static_cast<std::size_t>(incols) + (brownames ? 1 : 0);
            int rowsRead = 0;
            std::size_t rowsInBlock = 0;

            // Testing the getline result (instead of eof()) also processes a
            // final line that is not terminated by '\n'.
            while( std::getline(inFile, line, '\n') )
            {
                if( line.empty() ) {
                    continue;
                }

                if( rowsRead >= irows ) {
                    Rcpp::stop("File contains more data rows than expected, please review data");
                }

                // Split on the separator characters themselves (not on the
                // characters of the regular-expression text).
                boost::split(strValues, line, boost::is_any_of(stdsep), boost::token_compress_on);

                // A separator at the very end/start of the line yields an
                // extra empty token; discard it only when it is surplus.
                while( strValues.size() > expectedFields && strValues.back().empty() ) {
                    strValues.pop_back();
                }
                while( strValues.size() > expectedFields && strValues.front().empty() ) {
                    strValues.erase(strValues.begin());
                }

                if( strValues.size() != expectedFields ) {
                    Rcpp::stop("Unexpected number of fields in data row " + std::to_string(rowsRead + 1) +
                               ", please review data, note that fields without values are not allowed");
                }

                if( brownames ) {
                    svrownames[rowsRead] = strValues.front();
                    strValues.erase(strValues.begin());
                }

                // Append this row to the pending block
                std::move(strValues.begin(), strValues.end(), std::back_inserter(strBlockValues));
                strValues.clear();

                ++rowsRead;
                ++rowsInBlock;

                if( rowsInBlock == blockRows ) {
                    flushBlock();
                    rowsInBlock = 0;
                }
            }

            // Remaining rows that did not fill a complete block
            flushBlock();

            // ---- Dimnames --------------------------------------------------
            {
                // Automatic storage: released even if writeDimnames throws
                BigDataStatMeth::hdf5Dims dsdims(dsOut);

                if( brownames || bheader ) {
                    if( !brownames ) {
                        Rcpp::StringVector emptyRownames(1);
                        dsdims.writeDimnames( Rcpp::wrap(emptyRownames), Rcpp::wrap(svrcolnames));
                    } else if( !bheader ) {
                        Rcpp::StringVector emptyColnames(1);
                        dsdims.writeDimnames( svrownames, emptyColnames);
                    } else {
                        // Write rownames and colnames
                        dsdims.writeDimnames( svrownames, svrcolnames);
                    }
                }
            }

        } catch( H5::FileIException& ) {
            Rcpp::Rcerr<<"c++ exception Convert_text_to_HDF5 (File IException)" << std::endl;
            return;
        } catch( H5::GroupIException& ) {
            Rcpp::Rcerr<<"c++ exception Convert_text_to_HDF5 (Group IException)" << std::endl;
            return;
        } catch( H5::DataSetIException& ) {
            Rcpp::Rcerr<<"c++ exception Convert_text_to_HDF5 (DataSet IException)" << std::endl;
            return;
        } catch(const std::runtime_error& re) {
            Rcpp::Rcerr << "Runtime error: " << re.what() << std::endl;
            return;
        } catch(const std::exception& ex) {
            Rcpp::Rcerr << "Error occurred: " << ex.what() << std::endl;
            return;
        } catch(...) {
            Rcpp::Rcerr << "Unknown failure occurred. Possible memory corruption" << std::endl;
            return;
        }

        return;

    }

7 Usage Example

#include "BigDataStatMeth.hpp"

// Example usage (the function returns void, so there is no result to capture)
BigDataStatMeth::Rcpp_Import_File_to_hdf5(filename, dsOut);