Exercises

Exercise 1: Server connection

  • Build a connection object to a single study server: UMF Cluj (https://192.168.1.200:8005)
  • Login using the connection object
# Load the DataSHIELD client packages, plus httr for the HTTP client settings.
library(DSI)
library(DSOpal)
library(dsBaseClient)
library(httr)

# Describe the single study server to log in to.
# NOTE(review): the exercise text names UMF Cluj (https://192.168.1.200:8005)
# while this solution targets "verona" -- confirm which server is intended.
builder <- DSI::newDSLoginBuilder()
builder$append(
  server = "verona",
  url = "https://192.168.1.50:8890",
  user = "user_analisis",
  password = "********"
)
logindata <- builder$build()

# Switch off SSL peer verification for this R session before logging in.
# NOTE(review): presumably required because the server certificate cannot be
# verified -- confirm, since this weakens transport security.
httr::set_config(httr::config(ssl_verifypeer = 0L))

# Open the connection; `connections` is reused by all following exercises.
connections <- DSI::datashield.login(logins = logindata)

Exercise 2: Assign data

Using the connection object from the Exercise 1:

  • Find the available projects.
  • Find the available resources inside the projects.
  • Load and resolve the resource.
  • What type of object is the loaded resource?
  • What are the names of the available variables in that object?
  • How many individuals does it contain? And variables?
# Open a direct Opal session, used only to browse projects and resources.
o <- opalr::opal.login(
  username = "user_analisis",
  password = "*********",
  url = "https://192.168.1.50:8890"
)

# Available projects, then the resources inside two of them.
opalr::opal.projects(o)
opalr::opal.resources(o, "UMF_Cluj")
opalr::opal.resources(o, "S_uncover")

# Browsing is finished: close the Opal session so it is not left open.
# The analysis below only uses the DataSHIELD `connections` object.
opalr::opal.logout(o)

# Load the resource on the server side and resolve it into the symbol "data".
DSI::datashield.assign.resource(
  conns = connections,
  symbol = "resource",
  resource = "S_uncover.verona"
)
DSI::datashield.assign.expr(
  conns = connections,
  symbol = "data",
  expr = "as.resource.data.frame(resource)"
)

# Type of the loaded object, its variable names, and its dimensions.
ds.class("data")
ds.colnames("data")
ds.dim("data")

Exercise 3: Data validation

Using the connection object from the Exercise 1 and 2:

  • What is the class of the variables CSXRRA and TRXTR?
  • What is the range of the variable CSXRRA?
  • What are the categories of the variable TRXTR?
# Class of each variable; kept as separate top-level calls so that each
# result is printed.
ds.class(x = "data$CSXRRA")
ds.class(x = "data$TRXTR")

# Categories of TRXTR, taken from its one-way table.
ds.table(rvar = "data$TRXTR")

# Summary of CSXRRA.
# NOTE(review): used here to answer the "range" question -- confirm that the
# summary output reports what the exercise needs.
ds.summary(x = "data$CSXRRA")

Exercise 4: Data wrangling

Using the connection object from the Exercise 1 and 2:

  • Select 4 different categorical variables (Yes/No)
  • Create a new variable COUNTS that has the count for Yes.
  • Recode the variable with the levels 1, 2+.
  • Add the new variable to the original dataset.
# Inspect the four chosen Yes/No variables: first their class ...
ds.class(x = "data$TRXIS")
ds.class(x = "data$DSXIC")
ds.class(x = "data$CMXDI")
ds.class(x = "data$CMXHT")

# ... then their categories.
ds.table(rvar = "data$TRXIS")
ds.table(rvar = "data$DSXIC")
ds.table(rvar = "data$CMXDI")
ds.table(rvar = "data$CMXHT")

variables <- c("TRXIS", "DSXIC", "CMXDI", "CMXHT")

# Step 1: recode Yes/No to 1/0, creating server-side objects "<name>_recoded".
for (v in variables) {
  ds.recodeValues(
    var.name = paste0("data$", v),
    values2replace.vector = c("Yes", "No"),
    new.values.vector = c(1, 0),
    newobj = paste0(v, "_recoded")
  )
}

# Step 2: convert each recoded object to numeric ("<name>_recoded_num") so
# that the values can be summed.
for (v in variables) {
  ds.asNumeric(
    x.name = paste0(v, "_recoded"),
    newobj = paste0(v, "_recoded_num")
  )
}

# Step 3: bind the numeric indicators into one data frame and sum across each
# row, giving the number of "Yes" answers per individual.
ds.dataFrame(
  x = paste0(variables, "_recoded_num"),
  newobj = "joint_comorbidities"
)
ds.rowColCalc(
  x = "joint_comorbidities",
  operation = "rowSums",
  newobj = "new_variable"
)

# Step 4: turn the count into a factor and collapse 2, 3 and 4 into "2+".
# NOTE(review): the exercise text asks for levels 1 and 2+, whereas this
# recoding also keeps level 0 -- confirm which is intended.
ds.asFactor(
  input.var.name = "new_variable",
  newobj.name = "new_variable_factor"
)
ds.recodeValues(
  var.name = "new_variable_factor",
  values2replace.vector = c("0", "1", "2", "3", "4"),
  new.values.vector = c("0", "1", "2+", "2+", "2+"),
  newobj = "COUNTS"
)

# Step 5: append COUNTS to the original server-side dataset.
DSI::datashield.assign.expr(
  conns = connections,
  symbol = "data",
  expr = "cbind(data, COUNTS)"
)

Exercise 5: Descriptive analysis

Using the connection object from the Exercise 1 and 2:

  • Perform a boxplot of the variable CSXRRA.
  • Perform a boxplot of the variable CSXRRA grouped by CMXCPD.
  • Perform a boxplot of the variable CSXRRA grouped by CMXCPD and CMXCLD.
  • Calculate contingency table of the variables CMXCPD and CMXCLD.
# Boxplot of CSXRRA on its own.
ds.boxPlot(x = "data$CSXRRA")

# Create factor versions of the two grouping variables and append each one to
# "data", so the grouped boxplots can refer to them by column name.
ds.asFactor(
  input.var.name = "data$CMXCLD",
  newobj.name = "CMXCLD_factor"
)
DSI::datashield.assign.expr(
  conns = connections,
  symbol = "data",
  expr = "cbind(data, CMXCLD_factor)"
)
ds.asFactor(
  input.var.name = "data$CMXCPD",
  newobj.name = "CMXCPD_factor"
)
DSI::datashield.assign.expr(
  conns = connections,
  symbol = "data",
  expr = "cbind(data, CMXCPD_factor)"
)

# Boxplot of CSXRRA grouped by CMXCPD, then by CMXCPD and CMXCLD together.
ds.boxPlot(
  x = "data",
  variables = "CSXRRA",
  group = "CMXCPD_factor"
)
ds.boxPlot(
  x = "data",
  variables = "CSXRRA",
  group = "CMXCPD_factor",
  group2 = "CMXCLD_factor"
)

# Contingency table of CMXCPD (rows) against CMXCLD (columns).
ds.table(rvar = "data$CMXCPD", cvar = "data$CMXCLD")

Exercise 6: Statistical models

Using the connection object from the Exercise 1 and 2:

  • Fit a GLM (gaussian) with the model LBXSC3SIHn ~ TRXIS. (This model is for illustration purposes only, not to answer any scientific question.)
  • Fit a GLM (Poisson) with the model DATLGT ~ COUNTS (COUNTS is the variable created in Exercise 4).
# Gaussian GLM: LBXSC3SIHn explained by TRXIS.
ds.glm(
  formula = "LBXSC3SIHn ~ TRXIS",
  data = "data",
  family = "gaussian"
)

# Poisson GLM: DATLGT explained by COUNTS, the variable appended to "data"
# in the data wrangling exercise.
ds.glm(
  formula = "DATLGT ~ COUNTS",
  data = "data",
  family = "poisson"
)

Exercise 7: Extra

Using the connection object from the Exercise 1 and 2:

  • Do a variable selection (Lasso regression) without the date variables.
  • Do a survival analysis with the variables of the Lasso regression.
library(dsSurvivalClient)

# NOTE(review): "data_complete" and "DSXOS_recoded_num" are not created in the
# visible part of this file -- confirm that they already exist on the server
# before running this exercise.

# Class of every column of data_complete: the first class reported by the
# first server is taken for each column.
types <- unlist(
  lapply(ds.colnames("data_complete")[[1]], function(column) {
    ds.class(paste0("data_complete$", column))[[1]][1]
  })
)

# Positions of the columns that are not numeric; these are dropped below.
`%notin%` <- Negate(`%in%`)
indexes_to_remove <- which(types %notin% "numeric")

# Server-side vector of ones with one entry per row. Comparing it with itself
# is always TRUE, so ds.dataFrameSubset keeps every row and only drops columns.
times <- ds.dim("data_complete")[[1]][1]
ds.rep(
  x1 = 1,
  times = times,
  source.times = "c",
  source.each = "c",
  newobj = "ONES"
)

# Keep only the numeric columns.
ds.dataFrameSubset(
  df.name = "data_complete",
  V1.name = "ONES",
  V2.name = "ONES",
  Boolean.operator = "==",
  keep.cols = NULL,
  rm.cols = indexes_to_remove,
  keep.NAs = NULL,
  newobj = "data_complete_numeric",
  datasources = connections,
  notify.of.progress = FALSE
)

# Response for the Lasso.
ds.assign(
  toAssign = "data_complete_numeric$DSXOS_recoded_num",
  newobj = "Y",
  datasources = connections
)

# Predictors for the Lasso: the numeric data minus the columns at these
# hard-coded positions.
# NOTE(review): presumably the date variables and the response -- confirm the
# positions against the actual column order.
ds.dataFrameSubset(
  df.name = "data_complete_numeric",
  V1.name = "ONES",
  V2.name = "ONES",
  Boolean.operator = "==",
  keep.cols = NULL,
  rm.cols = c(1, 2, 5, 6, 9, 110, 111, 112),
  keep.NAs = NULL,
  newobj = "X",
  datasources = connections,
  notify.of.progress = FALSE
)

# The Lasso solver is given matrices, so convert both objects in place.
ds.asMatrix(x.name = "Y", newobj = "Y")
ds.asMatrix(x.name = "X", newobj = "X")

# Solver options passed to ds.LS_Lasso.
opts <- list(init = 0, maxIter = 10, tol = 0.01, ter = 2)

set.seed(123)
m1 <- dsMTLClient::ds.LS_Lasso(
  X = "X",
  Y = "Y",
  lam = 0.5,
  C = 0,
  opts,
  datasources = connections,
  nDigits = 15
)

# Variables with a non-zero Lasso coefficient.
variables_interest <- ds.colnames("X")[[1]][which(m1$w != 0)]

# Event indicator and follow-up time for the survival analysis.
ds.make(toAssign = "data$DSXOS_recoded_num", newobj = "EVENT")
ds.make(toAssign = "data$DATLGT", newobj = "SURVTIME")

# Survival formula with every selected variable, each prefixed with "data$".
formula <- paste0(
  "survival::Surv(time=SURVTIME,event=EVENT)~",
  paste0("data$", variables_interest, collapse = "+")
)

# Cox proportional hazards model on the selected variables.
dsSurvivalClient::ds.coxph.SLMA(
  formula = formula,
  dataName = "data",
  datasources = connections
)

# Build the survival curves on the server side, then plot them.
dsSurvivalClient::ds.survfit(formula = formula, objectname = "scurves")
library(survival)
dsSurvivalClient::ds.plotsurvfit(formula = "scurves")