12 Basic statistical analyses

Let us start by illustrating how to perform simple statistical data analyses using different resources. Here, we will use data from three studies that are available in our Opal demo repository. The three databases are called CNSIM1, CNSIM2 and CNSIM3 and are available as three different resources: a MySQL database, an SPSS file and a CSV file (see Figure 6.2). This example mimics real situations where different hospitals or research centers manage their own databases containing harmonized data. Data correspond to three simulated datasets with different numbers of observations of 11 harmonized variables. They contain synthetic data based on a model derived from the participants of the 1958 Birth Cohort, as part of an obesity methodological development project. These datasets do contain some NA values. The available variables are:

Variable Description Type Note
LAB_TSC Total Serum Cholesterol numeric mmol/L
LAB_TRIG Triglycerides numeric mmol/L
LAB_HDL HDL Cholesterol numeric mmol/L
LAB_GLUC_ADJUSTED Non-Fasting Glucose numeric mmol/L
PM_BMI_CONTINUOUS Body Mass Index (continuous) numeric kg/m2
DIS_CVA History of Stroke factor 0 = Never had stroke; 1 = Has had stroke
MEDI_LPD Current Use of Lipid Lowering Medication (from categorical assessment item) factor 0 = Not currently using lipid lowering medication; 1 = Currently using lipid lowering medication
DIS_DIAB History of Diabetes factor 0 = Never had diabetes; 1 = Has had diabetes
DIS_AMI History of Myocardial Infarction factor 0 = Never had myocardial infarction; 1 = Has had myocardial infarction
GENDER Gender factor 0 = Female; 1 = Male
PM_BMI_CATEGORICAL Body Mass Index (categorical) factor 1 = Less than 25 kg/m2; 2 = 25 to 30 kg/m2; 3 = Over 30 kg/m2

The analyses described here can also be found in the DataSHIELD Tutorial, where these resources were uploaded to the Opal server as three tables, an inferior approach since data have to be moved from their original repositories.

12.1 Analysis from a single study

Let us start by illustrating how to analyze one data set (CNSIM1).

library(DSOpal)
library(dsBaseClient)

# prepare login data and resource to assign:
# a single server entry named "study1" on the Opal demo server,
# requesting the resource RSRC.CNSIM1 through the Opal driver
builder <- DSI::newDSLoginBuilder()
builder$append(server = "study1", url = "https://opal-demo.obiba.org", 
               user = "dsuser", password = "password", 
               resource = "RSRC.CNSIM1", driver = "OpalDriver")
logindata <- builder$build()

# login and assign resource
# (assign = TRUE with symbol = "res": the resource is assigned on the
# server side under the name 'res')
conns <- DSI::datashield.login(logins = logindata, assign = TRUE, 
                               symbol = "res")


# coerce ResourceClient objects to a data.frame called 'D'
# (the expression is quoted so that it is evaluated on the server, not locally)
datashield.assign.expr(conns, symbol = "D", 
                       expr = quote(as.resource.data.frame(res, strict=TRUE)))

Then we can inspect the type of data we have

ds.class("D")
$study1
[1] "data.frame"
ds.colnames("D")
$study1
 [1] "id"                 "LAB_TSC"            "LAB_TRIG"           "LAB_HDL"            "LAB_GLUC_ADJUSTED" 
 [6] "PM_BMI_CONTINUOUS"  "DIS_CVA"            "MEDI_LPD"           "DIS_DIAB"           "DIS_AMI"           
[11] "GENDER"             "PM_BMI_CATEGORICAL"

Perform some data descriptive analyses

# one-way table of DIS_DIAB (history of diabetes): counts and proportions
ds.table("D$DIS_DIAB")

 Data in all studies were valid 

Study 1 :  No errors reported from this study
$output.list
$output.list$TABLE_rvar.by.study_row.props
          study
D$DIS_DIAB 1
         0 1
         1 1

$output.list$TABLE_rvar.by.study_col.props
          study
D$DIS_DIAB          1
         0 0.98613037
         1 0.01386963

$output.list$TABLE_rvar.by.study_counts
          study
D$DIS_DIAB    1
         0 2133
         1   30

$output.list$TABLES.COMBINED_all.sources_proportions
D$DIS_DIAB
     0      1 
0.9860 0.0139 

$output.list$TABLES.COMBINED_all.sources_counts
D$DIS_DIAB
   0    1 
2133   30 


$validity.message
[1] "Data in all studies were valid"
# two-way table of DIS_DIAB (rows) by GENDER (columns)
ds.table("D$DIS_DIAB", "D$GENDER")

 Data in all studies were valid 

Study 1 :  No errors reported from this study
$output.list
$output.list$TABLE.STUDY.1_row.props
          D$GENDER
D$DIS_DIAB     0     1
         0 0.502 0.498
         1 0.700 0.300

$output.list$TABLE.STUDY.1_col.props
          D$GENDER
D$DIS_DIAB      0      1
         0 0.9810 0.9920
         1 0.0192 0.0084

$output.list$TABLES.COMBINED_all.sources_row.props
          D$GENDER
D$DIS_DIAB     0     1
         0 0.502 0.498
         1 0.700 0.300

$output.list$TABLES.COMBINED_all.sources_col.props
          D$GENDER
D$DIS_DIAB      0      1
         0 0.9810 0.9920
         1 0.0192 0.0084

$output.list$TABLE_STUDY.1_counts
          D$GENDER
D$DIS_DIAB    0    1
         0 1071 1062
         1   21    9

$output.list$TABLES.COMBINED_all.sources_counts
          D$GENDER
D$DIS_DIAB    0    1
         0 1071 1062
         1   21    9


$validity.message
[1] "Data in all studies were valid"

Or even some statistical modelling. In this case we want to assess whether sex (GENDER) or triglycerides (LAB_TRIG) are risk factors for diabetes (DIS_DIAB).

# logistic regression (family = "binomial") of DIS_DIAB on LAB_TRIG and GENDER,
# with the variables taken from the server-side data frame 'D'
mod <- ds.glm(DIS_DIAB ~ LAB_TRIG + GENDER, data = "D" , family="binomial")
# coefficient table: estimates, standard errors, p-values and 95% intervals
mod$coeff
              Estimate Std. Error    z-value      p-value low0.95CI.LP high0.95CI.LP        P_OR
(Intercept) -5.1696619  0.4549328 -11.363572 6.349427e-30   -6.0613138    -4.2780099 0.005654338
LAB_TRIG     0.3813891  0.1037611   3.675647 2.372471e-04    0.1780211     0.5847570 1.464317247
GENDER      -0.2260851  0.4375864  -0.516664 6.053908e-01   -1.0837387     0.6315685 0.797650197
            low0.95CI.P_OR high0.95CI.P_OR
(Intercept)    0.002325913      0.01368049
LAB_TRIG       1.194850574      1.79455494
GENDER         0.338328242      1.88055787

As usual, the connection must be closed.

# close the connection(s) opened by datashield.login()
datashield.logout(conns)

12.2 Analysis from multiple studies

Now, let us illustrate a similar analysis with multiple studies. In this case we see results aggregated across all three studies.

library(DSOpal)
library(dsBaseClient)

# prepare login data and resources to assign:
# three server entries (study1, study2, study3) that all point at the same
# Opal demo server but request different resources (CNSIM1, CNSIM2, CNSIM3)
builder <- DSI::newDSLoginBuilder()
builder$append(server = "study1", url = "https://opal-demo.obiba.org", 
               user = "dsuser", password = "password", 
               resource = "RSRC.CNSIM1", driver = "OpalDriver")
builder$append(server = "study2", url = "https://opal-demo.obiba.org", 
               user = "dsuser", password = "password", 
               resource = "RSRC.CNSIM2", driver = "OpalDriver")
builder$append(server = "study3", url = "https://opal-demo.obiba.org", 
               user = "dsuser", password = "password", 
               resource = "RSRC.CNSIM3", driver = "OpalDriver")
logindata <- builder$build()

# login and assign resources
# (each study's resource is assigned on the server side under the name 'res')
conns <- datashield.login(logins = logindata, assign = TRUE, symbol = "res")

# assigned objects are of class ResourceClient (and others)
ds.class("res")
$study1
[1] "SQLResourceClient" "ResourceClient"    "R6"               

$study2
[1] "TidyFileResourceClient" "FileResourceClient"     "ResourceClient"         "R6"                    

$study3
[1] "TidyFileResourceClient" "FileResourceClient"     "ResourceClient"         "R6"                    
# coerce ResourceClient objects to data.frames
# (DataSHIELD config allows as.resource.data.frame() assignment function for the purpose of the demo)
# the quoted expression is evaluated on the server side and its result is stored as 'D'
datashield.assign.expr(conns, symbol = "D", 
                       expr = quote(as.resource.data.frame(res, strict = TRUE)))
ds.class("D")
$study1
[1] "data.frame"

$study2
[1] "data.frame"

$study3
[1] "data.frame"
# do usual dsBase analysis
ds.summary('D$LAB_HDL')
$study1
$study1$class
[1] "numeric"

$study1$length
[1] 2163

$study1$`quantiles & mean`
      5%      10%      25%      50%      75%      90%      95%     Mean 
0.875240 1.047400 1.300000 1.581000 1.844500 2.090000 2.210900 1.569416 


$study2
$study2$class
[1] "numeric"

$study2$length
[1] 3088

$study2$`quantiles & mean`
      5%      10%      25%      50%      75%      90%      95%     Mean 
0.850280 1.032200 1.294000 1.563000 1.840000 2.077000 2.225000 1.556648 


$study3
$study3$class
[1] "numeric"

$study3$length
[1] 4128

$study3$`quantiles & mean`
      5%      10%      25%      50%      75%      90%      95%     Mean 
0.876760 1.039200 1.304000 1.589000 1.856000 2.098800 2.244200 1.574687 
# vector types are not necessarily the same depending on the data reader that was used
ds.class('D$GENDER')
$study1
[1] "integer"

$study2
[1] "haven_labelled" "vctrs_vctr"     "double"        

$study3
[1] "numeric"
# convert D$GENDER to a factor; the result is stored as a new server-side
# object named 'GENDER' (the column inside 'D' itself is not replaced)
ds.asFactor('D$GENDER', 'GENDER')
$all.unique.levels
[1] "0" "1"

$return.message
[1] "Data object <GENDER> correctly created in all specified data sources"
ds.summary('GENDER')
$study1
$study1$class
[1] "factor"

$study1$length
[1] 2163

$study1$categories
[1] "0" "1"

$study1$`count of '0'`
[1] 1092

$study1$`count of '1'`
[1] 1071


$study2
$study2$class
[1] "factor"

$study2$length
[1] 3088

$study2$categories
[1] "0" "1"

$study2$`count of '0'`
[1] 1585

$study2$`count of '1'`
[1] 1503


$study3
$study3$class
[1] "factor"

$study3$length
[1] 4128

$study3$categories
[1] "0" "1"

$study3$`count of '0'`
[1] 2091

$study3$`count of '1'`
[1] 2037
# logistic regression (family = "binomial") of DIS_DIAB on LAB_TRIG and GENDER,
# fitted over the logged-in studies, with variables looked up in 'D'
# NOTE(review): with data = "D" the GENDER term presumably resolves to the
# column D$GENDER rather than the standalone factor 'GENDER' created by
# ds.asFactor() (the coefficient row is labelled GENDER, not GENDER1) -- confirm
# this is intended
mod <- ds.glm("DIS_DIAB ~ LAB_TRIG + GENDER", data = "D" , family="binomial")
# coefficient table: estimates, standard errors, p-values and 95% intervals
mod$coeff
              Estimate Std. Error    z-value       p-value low0.95CI.LP high0.95CI.LP       P_OR
(Intercept) -4.7792110 0.21081170 -22.670521 8.755236e-114   -5.1923944   -4.36602770 0.00833261
LAB_TRIG     0.3035931 0.05487436   5.532514  3.156737e-08    0.1960414    0.41114488 1.35471774
GENDER      -0.4455989 0.20797931  -2.142516  3.215202e-02   -0.8532309   -0.03796695 0.64044060
            low0.95CI.P_OR high0.95CI.P_OR
(Intercept)    0.005527953      0.01254229
LAB_TRIG       1.216577226      1.50854390
GENDER         0.426036242      0.96274475
# close the connections opened by datashield.login()
datashield.logout(conns)