OHDSI · egillax · Feb 10, 2025 · Aug 22, 2024 · Aug 26, 2024 · Aug 27, 2024
diff --git a/.github/workflows/R_CDM_check_hades.yaml b/.github/workflows/R_CDM_check_hades.yaml
@@ -74,17 +74,17 @@ jobs:
           while read -r cmd
           do
             eval sudo $cmd
-          done < <(Rscript -e 'writeLines(remotes::system_requirements("ubuntu", "22.04"))')
+          done < <(Rscript -e 'writeLines(remotes::system_requirements("ubuntu", "24.04"))')
 
       - uses: r-lib/actions/setup-r-dependencies@v2
         with:
           cache: always
           extra-packages: any::rcmdcheck
           needs: check
-          
+
       - name: setup r-reticulate venv
         run: | 
-          uv pip install polars tqdm connectorx pyarrow pynvml numpy
+          uv pip install polars tqdm connectorx pyarrow duckdb pynvml numpy
           uv pip install torch --index https://download.pytorch.org/whl/cpu/
 
       - uses: r-lib/actions/check-r-package@v2

diff --git a/.github/workflows/R_CMD_check_main_weekly.yaml b/.github/workflows/R_CMD_check_main_weekly.yaml
@@ -20,33 +20,7 @@ jobs:
       GITHUB_PAT: ${{ secrets.GH_TOKEN }}
       R_REMOTES_NO_ERRORS_FROM_WARNINGS: true
       RSPM: ${{ matrix.config.rspm }}
-      CDM5_ORACLE_CDM_SCHEMA: ${{ secrets.CDM5_ORACLE_CDM_SCHEMA }}
-      CDM5_ORACLE_OHDSI_SCHEMA: ${{ secrets.CDM5_ORACLE_OHDSI_SCHEMA }}
-      CDM5_ORACLE_PASSWORD: ${{ secrets.CDM5_ORACLE_PASSWORD }}
-      CDM5_ORACLE_SERVER: ${{ secrets.CDM5_ORACLE_SERVER }}
-      CDM5_ORACLE_USER: ${{ secrets.CDM5_ORACLE_USER }}
-      CDM5_POSTGRESQL_CDM_SCHEMA: ${{ secrets.CDM5_POSTGRESQL_CDM_SCHEMA }}
-      CDM5_POSTGRESQL_OHDSI_SCHEMA: ${{ secrets.CDM5_POSTGRESQL_OHDSI_SCHEMA }}
-      CDM5_POSTGRESQL_PASSWORD: ${{ secrets.CDM5_POSTGRESQL_PASSWORD }}
-      CDM5_POSTGRESQL_SERVER: ${{ secrets.CDM5_POSTGRESQL_SERVER }}
-      CDM5_POSTGRESQL_USER: ${{ secrets.CDM5_POSTGRESQL_USER }}
-      CDM5_SQL_SERVER_CDM_SCHEMA: ${{ secrets.CDM5_SQL_SERVER_CDM_SCHEMA }}
-      CDM5_SQL_SERVER_OHDSI_SCHEMA: ${{ secrets.CDM5_SQL_SERVER_OHDSI_SCHEMA }}
-      CDM5_SQL_SERVER_PASSWORD: ${{ secrets.CDM5_SQL_SERVER_PASSWORD }}
-      CDM5_SQL_SERVER_SERVER: ${{ secrets.CDM5_SQL_SERVER_SERVER }}
-      CDM5_SQL_SERVER_USER: ${{ secrets.CDM5_SQL_SERVER_USER }}
-      CDM5_REDSHIFT_CDM_SCHEMA: ${{ secrets.CDM5_REDSHIFT_CDM_SCHEMA }}
-      CDM5_REDSHIFT_OHDSI_SCHEMA: ${{ secrets.CDM5_REDSHIFT_OHDSI_SCHEMA }}
-      CDM5_REDSHIFT_PASSWORD: ${{ secrets.CDM5_REDSHIFT_PASSWORD }}
-      CDM5_REDSHIFT_SERVER: ${{ secrets.CDM5_REDSHIFT_SERVER }}
-      CDM5_REDSHIFT_USER: ${{ secrets.CDM5_REDSHIFT_USER }}
-      CDM5_SPARK_USER: ${{ secrets.CDM5_SPARK_USER }}
-      CDM5_SPARK_PASSWORD: ${{ secrets.CDM5_SPARK_PASSWORD }}
-      CDM5_SPARK_CONNECTION_STRING: ${{ secrets.CDM5_SPARK_CONNECTION_STRING }}
-      WEBAPI_TEST_WEBAPI_URL: ${{ secrets.WEBAPI_TEST_WEBAPI_URL }}
-      WEBAPI_TEST_SECURE_WEBAPI_URL: ${{ secrets.WEBAPI_TEST_SECURE_WEBAPI_URL }}
-      WEBAPI_TEST_ADMIN_USER_NAME: ${{ secrets.WEBAPI_TEST_ADMIN_USER_NAME }}
-      WEBAPI_TEST_ADMIN_USER_PASSWORD: ${{ secrets.WEBAPI_TEST_ADMIN_USER_PASSWORD }}
+      UV_SYSTEM_PYTHON: 1
 
     steps:
       - uses: actions/checkout@v4
@@ -59,6 +33,12 @@ jobs:
         with:
           python-version: '3.11' 
 
+      - name: Install the latest version of uv
+        uses: astral-sh/setup-uv@v4
+        with:
+          version: "latest"
+          python-version: "3.11"
+
       - uses: r-lib/actions/setup-tinytex@v2
 
       - uses: r-lib/actions/setup-pandoc@v2
@@ -69,20 +49,9 @@ jobs:
           needs: check
 
       - name: setup r-reticulate venv
-        shell: Rscript {0}
-        run: |
-          python_packages <- 
-            c("polars", "tqdm", "connectorx", "pyarrow")
-
-          library(reticulate)
-          virtualenv_create("r-reticulate", Sys.which("python"),
-            packages=python_packages)
-          virtualenv_install("r-reticulate", "torch", 
-            pip_options = c("--index-url https://download.pytorch.org/whl/cpu"))
-
-          path_to_python <- virtualenv_python("r-reticulate")
-          writeLines(sprintf("RETICULATE_PYTHON=%s", path_to_python),
-                     Sys.getenv("GITHUB_ENV"))
+        run: | 
+          uv pip install polars tqdm connectorx pyarrow duckdb pynvml numpy
+          uv pip install torch --index https://download.pytorch.org/whl/cpu/
 
       - uses: r-lib/actions/check-r-package@v2
         with:

diff --git a/DESCRIPTION b/DESCRIPTION
@@ -41,15 +41,16 @@ Remotes:
 RoxygenNote: 7.3.2
 Encoding: UTF-8
 Config/testthat/edition: 3
-Config/testthat/parallel: TRUE
+Config/testthat/parallel: FALSE
 Config/reticulate:
   list(
     packages = list(
       list(package = "torch"),
       list(package = "polars"),
+      list(package = "duckdb"),
+      list(package = "pyarrow"),
       list(package = "tqdm"),
       list(package = "connectorx"),
-      list(package = "pyarrow"),
       list(package = "pynvml")
       )
   )
diff --git a/DeepPatientLevelPrediction.Rproj b/DeepPatientLevelPrediction.Rproj
@@ -1,4 +1,5 @@
 Version: 1.0
+ProjectId: 412fe21b-6e78-47bf-9eb3-912b0e9ceda1
 
 RestoreWorkspace: No
 SaveWorkspace: No

diff --git a/Dockerfile b/Dockerfile
@@ -34,7 +34,7 @@ RUN pip3 install uv \
     && uv pip install --system --no-cache-dir \
     connectorx \
     polars \
-    pyarrow \
+    duckdb \
     torch \
     tqdm \
     pynvml \

diff --git a/NAMESPACE b/NAMESPACE
@@ -3,6 +3,7 @@
 export(fitEstimator)
 export(gridCvDeep)
 export(predictDeepEstimator)
+export(setCustomEmbeddingModel)
 export(setDefaultResNet)
 export(setDefaultTransformer)
 export(setEstimator)

diff --git a/R/CustomEmbeddingModel.R b/R/CustomEmbeddingModel.R
@@ -0,0 +1,73 @@
+# @file CustomEmbeddingModel.R
+#
+# Copyright 2024 Observational Health Data Sciences and Informatics
+#
+# This file is part of DeepPatientLevelPrediction
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#' Create default settings for a model using custom embeddings
+#'
+#' @description A model that uses custom embeddings such as Poincare embeddings or
+#' embeddings from a foundation model
+#' @param embeddingFilePath path to the saved embeddings. The embeddings file
+#' should be a pytorch file including a dictionary with two fields:
+#' `concept_ids`: a pytorch long tensor with the concept ids and `embeddings`:
+#' a pytorch float tensor with the embeddings
+#' @param modelSettings for the model to use, needs to have an embedding layer
+#' with a name `embedding` which will be replaced by the custom embeddings
+#' @param embeddingsClass the class of the custom embeddings, either
+#' `CustomEmbeddings` or `PoincareEmbeddings`
+#'
+#' @return settings for a model using custom embeddings
+#'
+#' @export
+setCustomEmbeddingModel <- function(
+    embeddingFilePath,
+    modelSettings = setTransformer(
+      numBlocks = 3,
+      dimToken = 16,
+      dimOut = 1,
+      numHeads = 4,
+      attDropout = 0.2,
+      ffnDropout = 0.1,
+      resDropout = 0.0,
+      dimHidden = 32,
+      estimatorSettings = setEstimator(learningRate = "auto",
+                                       weightDecay = 1e-4,
+                                       batchSize = 256,
+                                       epochs = 2,
+                                       seed = NULL,
+                                       device = "cpu"),
+      hyperParamSearch = "random",
+      randomSample = 1
+    ),
+    embeddingsClass = "CustomEmbeddings"
+) {
+  # Validate before normalizing so bad input gets the package's own error.
+  checkIsClass(embeddingFilePath, "character")
+  checkFileExists(embeddingFilePath)
+  embeddingFilePath <- normalizePath(embeddingFilePath, mustWork = TRUE)
+  checkIsClass(embeddingsClass, "character")
+  checkInStringVector(embeddingsClass, c("CustomEmbeddings", "PoincareEmbeddings"))
+
+  path <- system.file("python", package = "DeepPatientLevelPrediction")
+  initStrategyModule <- reticulate::import_from_path("InitStrategy", path = path)
+  modelSettings$estimatorSettings$initStrategy <-
+    initStrategyModule$CustomEmbeddingInitStrategy(
+      embedding_class = embeddingsClass,
+      embedding_file = embeddingFilePath
+    )
+  # Rename the supplied settings in place; they need not be a transformer.
+  attr(modelSettings, "settings")$name <- "CustomEmbeddingModel"
+  return(modelSettings)
+}
diff --git a/R/Dataset.R b/R/Dataset.R
@@ -36,7 +36,7 @@ createDataset <- function(data, labels, plpModel = NULL) {
       r_to_py(as.array(which(plpModel$covariateImportance$isNumeric)))
     data <- dataset(r_to_py(normalizePath(attributes(data)$path)),
       numerical_features = numericalFeatures
-    )
+      )
   }
 
   return(data)

diff --git a/R/Estimator.R b/R/Estimator.R
@@ -216,7 +216,7 @@ fitEstimator <- function(trainData,
       as.integer()
   )
   covariateRef <- covariateRef %>%
-    dplyr::arrange("columnId") %>%
+    dplyr::arrange(.data$columnId) %>%
     dplyr::collect() %>%
     dplyr::mutate(
       included = incs,
@@ -225,6 +225,7 @@ fitEstimator <- function(trainData,
     )
 
   comp <- start - Sys.time()
+  modelSettings$estimatorSettings$initStrategy <- NULL
   result <- list(
     model = cvResult$estimator,
     preprocessing = list(
@@ -300,7 +301,17 @@ predictDeepEstimator <- function(plpModel,
     plpModel <- list(model = plpModel)
     attr(plpModel, "modelType") <- "binary"
   }
-  if ("plpData" %in% class(data)) {
+
+  if (!is.null(plpModel$covariateImportance)) {
+    # covariateImportance is only added at the end of training, so its presence means the model finished training
+    mappedData <- PatientLevelPrediction::MapIds(data$covariateData,
+                                                 cohort = cohort,
+                                                 mapping = plpModel$covariateImportance %>%
+                                                   dplyr::select("columnId", "covariateId")
+    )
+    data <- createDataset(mappedData, plpModel = plpModel)
+
+  } else if ("plpData" %in% class(data)) {
     mappedData <- PatientLevelPrediction::MapIds(data$covariateData,
       cohort = cohort,
       mapping = plpModel$covariateImportance %>%
@@ -314,18 +325,20 @@ predictDeepEstimator <- function(plpModel,
   if (is.character(plpModel$model)) {
     model <- torch$load(file.path(plpModel$model,
                                   "DeepEstimatorModel.pt"),                        
-                        map_location = "cpu")
+                        map_location = "cpu",
+                        weights_only = FALSE)
     if (is.null(model$model_parameters$model_type)) {
       # for backwards compatibility
       model$model_parameters$model_type <- plpModel$modelDesign$modelSettings$modelType
     }
     model$estimator_settings$device <-
       plpModel$modelDesign$modelSettings$estimatorSettings$device
+    modelParameters <- snakeCaseToCamelCaseNames(model$model_parameters)
+    estimatorSettings <- snakeCaseToCamelCaseNames(model$estimator_settings)
+    parameters <- list(modelParameters = modelParameters,
+                       estimatorSettings = estimatorSettings)
     estimator <-
-      createEstimator(modelParameters =
-                      snakeCaseToCamelCaseNames(model$model_parameters),
-                      estimatorSettings =
-                      snakeCaseToCamelCaseNames(model$estimator_settings))
+      createEstimator(parameters = parameters)
     estimator$model$load_state_dict(model$model_state_dict)
     prediction$value <- estimator$predict_proba(data)
   } else {
@@ -424,7 +437,7 @@ gridCvDeep <- function(mappedData,
     dplyr::select(-"index")
   prediction$cohortStartDate <- as.Date(prediction$cohortStartDate,
     origin = "1970-01-01")
-  numericalIndex <- dataset$get_numerical_features()
+  numericalIndex <- dataset$numerical_features$to_list()
 
   # save torch code here
   if (!dir.exists(file.path(modelLocation))) {
@@ -437,7 +450,7 @@ gridCvDeep <- function(mappedData,
       prediction = prediction,
       finalParam = finalParam,
       paramGridSearch = paramGridSearch,
-      numericalIndex = numericalIndex$to_list()
+      numericalIndex = numericalIndex
     )
   )
 }
@@ -469,22 +482,20 @@ evalEstimatorSettings <- function(estimatorSettings) {
   estimatorSettings
 }
 
-createEstimator <- function(modelParameters,
-                            estimatorSettings) {
+createEstimator <- function(parameters) {
   path <- system.file("python", package = "DeepPatientLevelPrediction")
   model <-
-    reticulate::import_from_path(modelParameters$modelType,
-                                 path = path)[[modelParameters$modelType]]
+    reticulate::import_from_path(parameters$modelParameters$modelType,
+                                 path = path)[[parameters$modelParameters$modelType]]
   estimator <- reticulate::import_from_path("Estimator", path = path)$Estimator
 
-  modelParameters <- camelCaseToSnakeCaseNames(modelParameters)
-  estimatorSettings <- camelCaseToSnakeCaseNames(estimatorSettings)
-  estimatorSettings <- evalEstimatorSettings(estimatorSettings)
-
+  parameters$modelParameters <- camelCaseToSnakeCaseNames(parameters$modelParameters)
+  parameters$estimatorSettings <- camelCaseToSnakeCaseNames(parameters$estimatorSettings)
+  parameters$estimatorSettings <- evalEstimatorSettings(parameters$estimatorSettings)
+  parameters <- camelCaseToSnakeCaseNames(parameters)
   estimator <- estimator(
     model = model,
-    model_parameters = modelParameters,
-    estimator_settings = estimatorSettings
+    parameters = parameters
   )
   return(estimator)
 }
@@ -575,16 +586,19 @@ doCrossValidationImpl <- function(dataset,
   )]
   currentModelParams <- parameters[modelSettings$modelParamNames]
   attr(currentModelParams, "metaData")$names <-
-    modelSettings$modelParamNameCH
+    modelSettings$modelParamNames
   currentModelParams$modelType <- modelSettings$modelType
   currentEstimatorSettings <-
     fillEstimatorSettings(modelSettings$estimatorSettings,
                           fitParams,
                           parameters)
-  currentModelParams$catFeatures <- dataset$get_cat_features()$max()
-  currentModelParams$numFeatures <- dataset$get_numerical_features()$len()
+  currentModelParams$feature_info <- dataset$get_feature_info()
+  currentParameters <- list(
+    modelParameters = currentModelParams,
+    estimatorSettings = currentEstimatorSettings
+  )
   if (currentEstimatorSettings$findLR) {
-    lr <- getLR(currentModelParams, currentEstimatorSettings, dataset)
+    lr <- getLR(currentParameters, dataset)
     ParallelLogger::logInfo(paste0("Auto learning rate selected as: ", lr))
     currentEstimatorSettings$learningRate <- lr
   }
@@ -609,8 +623,7 @@ doCrossValidationImpl <- function(dataset,
     testDataset <- torch$utils$data$Subset(dataset,
                                            indices =
                                              as.integer(which(fold == i) - 1))
-    estimator <- createEstimator(modelParameters = currentModelParams,
-                                 estimatorSettings = currentEstimatorSettings)
+    estimator <- createEstimator(currentParameters)
     fit_estimator(estimator, trainDataset, testDataset)
 
     ParallelLogger::logInfo("Calculating predictions on left out fold set...")
@@ -663,8 +676,7 @@ trainFinalModel <- function(dataset, finalParam, modelSettings, labels) {
 
     fitParams <- names(finalParam)[grepl("^estimator", names(finalParam))]
 
-    modelParams$catFeatures <- dataset$get_cat_features()$max()
-    modelParams$numFeatures <- dataset$get_numerical_features()$len()
+    modelParams$featureInfo <- dataset$get_feature_info()
     modelParams$modelType <- modelSettings$modelType
 
     estimatorSettings <- fillEstimatorSettings(
@@ -673,8 +685,11 @@ trainFinalModel <- function(dataset, finalParam, modelSettings, labels) {
       finalParam
     )
     estimatorSettings$learningRate <- finalParam$learnSchedule$LRs[[1]]
-    estimator <- createEstimator(modelParameters = modelParams,
-                                 estimatorSettings = estimatorSettings)
+    parameters <- list(
+      modelParameters = modelParams,
+      estimatorSettings = estimatorSettings
+    )
+    estimator <- createEstimator(parameters = parameters)
     estimator$fit_whole_training_set(dataset, finalParam$learnSchedule$LRs)
 
     ParallelLogger::logInfo("Calculating predictions on all train data...")