12 Basic statistical analyses
Let us start by illustrating how to perform simple statistical data analyses using different resources. Here, we will use data from three studies that are available in our Opal demo repository. The three databases are called CNSIM1, CNSIM2 and CNSIM3, and are available as three different resources: a MySQL database, an SPSS file and a CSV file (see Figure 6.2). This example mimics real situations where different hospitals or research centers manage their own databases containing harmonized data. The data correspond to three simulated datasets with different numbers of observations of 11 harmonized variables. They contain synthetic data based on a model derived from the participants of the 1958 Birth Cohort, as part of an obesity methodological development project. The datasets do contain some NA values. The available variables are:
Variable | Description | Type | Note |
---|---|---|---|
LAB_TSC | Total Serum Cholesterol | numeric | mmol/L |
LAB_TRIG | Triglycerides | numeric | mmol/L |
LAB_HDL | HDL Cholesterol | numeric | mmol/L |
LAB_GLUC_ADJUSTED | Non-Fasting Glucose | numeric | mmol/L |
PM_BMI_CONTINUOUS | Body Mass Index (continuous) | numeric | kg/m2 |
DIS_CVA | History of Stroke | factor | 0 = Never had stroke; 1 = Has had stroke |
MEDI_LPD | Current Use of Lipid Lowering Medication (from categorical assessment item) | factor | 0 = Not currently using lipid lowering medication; 1 = Currently using lipid lowering medication |
DIS_DIAB | History of Diabetes | factor | 0 = Never had diabetes; 1 = Has had diabetes |
DIS_AMI | History of Myocardial Infarction | factor | 0 = Never had myocardial infarction; 1 = Has had myocardial infarction |
GENDER | Gender | factor | 0 = Female |
PM_BMI_CATEGORICAL | Body Mass Index (categorical) | factor | 1 = Less than 25 kg/m2; 2 = 25 to 30 kg/m2; 3 = Over 30 kg/m2 |
The analyses described here can also be found in the DataSHIELD Tutorial, where these resources were uploaded to the Opal server as three tables, an inferior approach since the data have to be moved from their original repositories.
12.1 Analysis from a single study
Let us start by illustrating how to analyze one data set (CNSIM1).
library(DSOpal)
library(dsBaseClient)

# Describe the single server to log in to and the resource to assign on it
builder <- DSI::newDSLoginBuilder()
builder$append(
  server = "study1",
  url = "https://opal-demo.obiba.org",
  user = "dsuser",
  password = "password",
  resource = "RSRC.CNSIM1",
  driver = "OpalDriver"
)
logindata <- builder$build()

# Log in; the resource is assigned on the server side to the symbol 'res'
conns <- DSI::datashield.login(
  logins = logindata,
  assign = TRUE,
  symbol = "res"
)

# Coerce the ResourceClient object 'res' to a data.frame called 'D'
DSI::datashield.assign.expr(
  conns,
  symbol = "D",
  expr = quote(as.resource.data.frame(res, strict = TRUE))
)
Then we can inspect the type of data we have
$study1
[1] "data.frame"
$study1
[1] "id" "LAB_TSC" "LAB_TRIG" "LAB_HDL" "LAB_GLUC_ADJUSTED"
[6] "PM_BMI_CONTINUOUS" "DIS_CVA" "MEDI_LPD" "DIS_DIAB" "DIS_AMI"
[11] "GENDER" "PM_BMI_CATEGORICAL"
Perform some data descriptive analyses
Data in all studies were valid
Study 1 : No errors reported from this study
$output.list
$output.list$TABLE_rvar.by.study_row.props
study
D$DIS_DIAB 1
0 1
1 1
$output.list$TABLE_rvar.by.study_col.props
study
D$DIS_DIAB 1
0 0.98613037
1 0.01386963
$output.list$TABLE_rvar.by.study_counts
study
D$DIS_DIAB 1
0 2133
1 30
$output.list$TABLES.COMBINED_all.sources_proportions
D$DIS_DIAB
0 1
0.9860 0.0139
$output.list$TABLES.COMBINED_all.sources_counts
D$DIS_DIAB
0 1
2133 30
$validity.message
[1] "Data in all studies were valid"
Data in all studies were valid
Study 1 : No errors reported from this study
$output.list
$output.list$TABLE.STUDY.1_row.props
D$GENDER
D$DIS_DIAB 0 1
0 0.502 0.498
1 0.700 0.300
$output.list$TABLE.STUDY.1_col.props
D$GENDER
D$DIS_DIAB 0 1
0 0.9810 0.9920
1 0.0192 0.0084
$output.list$TABLES.COMBINED_all.sources_row.props
D$GENDER
D$DIS_DIAB 0 1
0 0.502 0.498
1 0.700 0.300
$output.list$TABLES.COMBINED_all.sources_col.props
D$GENDER
D$DIS_DIAB 0 1
0 0.9810 0.9920
1 0.0192 0.0084
$output.list$TABLE_STUDY.1_counts
D$GENDER
D$DIS_DIAB 0 1
0 1071 1062
1 21 9
$output.list$TABLES.COMBINED_all.sources_counts
D$GENDER
D$DIS_DIAB 0 1
0 1071 1062
1 21 9
$validity.message
[1] "Data in all studies were valid"
Or even some statistical modelling. In this case we want to assess whether sex (GENDER) or triglycerides (LAB_TRIG) are risk factors for diabetes (DIS_DIAB)
Estimate Std. Error z-value p-value low0.95CI.LP high0.95CI.LP P_OR
(Intercept) -5.1696619 0.4549328 -11.363572 6.349427e-30 -6.0613138 -4.2780099 0.005654338
LAB_TRIG 0.3813891 0.1037611 3.675647 2.372471e-04 0.1780211 0.5847570 1.464317247
GENDER -0.2260851 0.4375864 -0.516664 6.053908e-01 -1.0837387 0.6315685 0.797650197
low0.95CI.P_OR high0.95CI.P_OR
(Intercept) 0.002325913 0.01368049
LAB_TRIG 1.194850574 1.79455494
GENDER 0.338328242 1.88055787
As usual the connection must be closed
12.2 Analysis from multiple studies
Now, let us illustrate a similar analysis with multiple studies. In this case we see results aggregated across all three studies.
library(DSOpal)
library(dsBaseClient)

# Build one login entry per study. All three studies are served by the same
# demo Opal server with the same credentials; only the server name and the
# resource differ, so the entries are generated from a named vector.
builder <- DSI::newDSLoginBuilder()
study_resources <- c(
  study1 = "RSRC.CNSIM1",
  study2 = "RSRC.CNSIM2",
  study3 = "RSRC.CNSIM3"
)
for (study in names(study_resources)) {
  builder$append(
    server = study,
    url = "https://opal-demo.obiba.org",
    user = "dsuser",
    password = "password",
    resource = study_resources[[study]],
    driver = "OpalDriver"
  )
}
logindata <- builder$build()

# Log in to every study and assign each resource to the symbol 'res'
conns <- DSI::datashield.login(
  logins = logindata,
  assign = TRUE,
  symbol = "res"
)

# The assigned objects are of class ResourceClient (and others)
ds.class("res")
$study1
[1] "SQLResourceClient" "ResourceClient" "R6"
$study2
[1] "TidyFileResourceClient" "FileResourceClient" "ResourceClient" "R6"
$study3
[1] "TidyFileResourceClient" "FileResourceClient" "ResourceClient" "R6"
# Turn the ResourceClient objects into data.frames named 'D' on every server.
# (The DataSHIELD configuration of the demo server allows
# as.resource.data.frame() as an assignment function for the purpose of the
# demo.)
DSI::datashield.assign.expr(
  conns,
  symbol = "D",
  expr = quote(as.resource.data.frame(res, strict = TRUE))
)
ds.class("D")
$study1
[1] "data.frame"
$study2
[1] "data.frame"
$study3
[1] "data.frame"
$study1
$study1$class
[1] "numeric"
$study1$length
[1] 2163
$study1$`quantiles & mean`
5% 10% 25% 50% 75% 90% 95% Mean
0.875240 1.047400 1.300000 1.581000 1.844500 2.090000 2.210900 1.569416
$study2
$study2$class
[1] "numeric"
$study2$length
[1] 3088
$study2$`quantiles & mean`
5% 10% 25% 50% 75% 90% 95% Mean
0.850280 1.032200 1.294000 1.563000 1.840000 2.077000 2.225000 1.556648
$study3
$study3$class
[1] "numeric"
$study3$length
[1] 4128
$study3$`quantiles & mean`
5% 10% 25% 50% 75% 90% 95% Mean
0.876760 1.039200 1.304000 1.589000 1.856000 2.098800 2.244200 1.574687
# The class of a column may differ between studies, depending on which data
# reader was used to load each resource
ds.class("D$GENDER")
$study1
[1] "integer"
$study2
[1] "haven_labelled" "vctrs_vctr" "double"
$study3
[1] "numeric"
$all.unique.levels
[1] "0" "1"
$return.message
[1] "Data object <GENDER> correctly created in all specified data sources"
$study1
$study1$class
[1] "factor"
$study1$length
[1] 2163
$study1$categories
[1] "0" "1"
$study1$`count of '0'`
[1] 1092
$study1$`count of '1'`
[1] 1071
$study2
$study2$class
[1] "factor"
$study2$length
[1] 3088
$study2$categories
[1] "0" "1"
$study2$`count of '0'`
[1] 1585
$study2$`count of '1'`
[1] 1503
$study3
$study3$class
[1] "factor"
$study3$length
[1] 4128
$study3$categories
[1] "0" "1"
$study3$`count of '0'`
[1] 2091
$study3$`count of '1'`
[1] 2037
Estimate Std. Error z-value p-value low0.95CI.LP high0.95CI.LP P_OR
(Intercept) -4.7792110 0.21081170 -22.670521 8.755236e-114 -5.1923944 -4.36602770 0.00833261
LAB_TRIG 0.3035931 0.05487436 5.532514 3.156737e-08 0.1960414 0.41114488 1.35471774
GENDER -0.4455989 0.20797931 -2.142516 3.215202e-02 -0.8532309 -0.03796695 0.64044060
low0.95CI.P_OR high0.95CI.P_OR
(Intercept) 0.005527953 0.01254229
LAB_TRIG 1.216577226 1.50854390
GENDER 0.426036242 0.96274475