pharmaverse · MattBearham · Oct 31, 2024 · Nov 4, 2024 · Nov 4, 2024 · Nov 4, 2024
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -29,7 +29,7 @@ Description: Uses the metadata information stored in 'metacore' objects to check
 License: MIT + file LICENSE
 Encoding: UTF-8
 LazyData: true
-RoxygenNote: 7.3.1
+RoxygenNote: 7.3.2
 Imports: 
     dplyr,
     metacore (>= 0.0.4),

diff --git a/NAMESPACE b/NAMESPACE
@@ -75,6 +75,7 @@ importFrom(stringr,str_c)
 importFrom(stringr,str_count)
 importFrom(stringr,str_detect)
 importFrom(stringr,str_extract)
+importFrom(stringr,str_match_all)
 importFrom(stringr,str_remove)
 importFrom(stringr,str_remove_all)
 importFrom(stringr,str_split)

diff --git a/R/build.R b/R/build.R
@@ -16,16 +16,22 @@
 #' @param predecessor_only By default `TRUE`, so only variables with the origin
 #'   of 'Predecessor' will be used. If `FALSE` any derivation matching the
 #'   dataset.variable will be used.
-#' @param keep Boolean to determine if the original columns should be kept. By
-#'   default `FALSE`, so only the ADaM columns are kept. If `TRUE` the resulting
-#'   dataset will have all the ADaM columns as well as any SDTM column that were
-#'   renamed in the ADaM (i.e `ARM` and `TRT01P` will be in the resulting
-#'   dataset)
+#' @param keep String to determine which columns from the original datasets
+#'   should be kept
+#'   - "FALSE" (default):  only columns that are also present in the ADaM
+#'                            specification are kept in the output.
+#'   - "ALL":              all original columns are carried through to the
+#'                            ADaM, including those that have been renamed.
+#'   - "PREREQUISITE":     columns are kept if they are required for future
+#'                            derivations in the specification. For example, if
+#'                            a derivation references VSSTDTC despite this not
+#'                            being present in the ADaM specification, the column
+#'                            will be kept.
 #'
 #' @return dataset
 #' @export
 #' @importFrom stringr str_to_lower str_detect str_extract str_to_upper
-#'   str_split
+#'   str_split str_match_all
 #' @importFrom dplyr filter pull mutate group_by group_split inner_join select
 #'   full_join bind_rows
 #' @importFrom tidyr unnest
@@ -40,8 +46,30 @@
 #' spec <- metacore %>% select_dataset("ADSL")
 #' ds_list <- list(DM = read_xpt(metatools_example("dm.xpt")))
 #' build_from_derived(spec, ds_list, predecessor_only = FALSE)
+#'
+#' # Building an ADaM (ADVS) from multiple input datasets, keeping columns
+#' # needed for future transformations
+#' library(metacore)
+#' library(haven)
+#' library(magrittr)
+#' library(safetyData)
+#' load(metacore_example("pilot_ADaM.rda"))
+#' spec <- metacore %>% select_dataset("ADVS")
+#' ds_list <- list("VS" = safetyData::sdtm_vs,"ADSL" = safetyData::adam_adsl)
+#' build_from_derived(spec,
+#'                    ds_list,
+#'                    predecessor_only = FALSE,
+#'                    keep = "PREREQUISITE"
+#' )
+
 build_from_derived <- function(metacore, ds_list, dataset_name = NULL,
                                predecessor_only = TRUE, keep = FALSE) {
+   # Deprecate KEEP = TRUE
+   keep <- match.arg(as.character(keep), c("TRUE", "FALSE", "ALL", "PREREQUISITE"))
+   if (keep == "TRUE"){
+      warning("Setting 'keep' = TRUE has been superseded, and will be unavailable in future releases.
+   Please consider setting 'keep' equal to 'ALL' or 'PREREQUISITE'.")
+   }
    metacore <- make_lone_dataset(metacore, dataset_name)
    derirvations <- metacore$derivations %>%
       mutate(derivation = trimws(derivation))
@@ -60,6 +88,7 @@ build_from_derived <- function(metacore, ds_list, dataset_name = NULL,
 
    vars_to_pull_through <- derirvations %>%
       filter(str_detect(derivation, "^\\w*\\.[a-zA-Z0-9]*$"))
+
    # To lower so it is flexible about how people name their ds list
    vars_w_ds <- vars_to_pull_through %>%
       mutate(ds = str_extract(derivation, "^\\w*(?=\\.)") %>%
@@ -123,12 +152,11 @@ build_from_derived <- function(metacore, ds_list, dataset_name = NULL,
       bind_rows(additional_vals) %>%
       group_by(ds) %>%
       group_split() %>%
-      map(get_variables, ds_list, keep) %>%
+      map(get_variables, ds_list, keep, derirvations) %>%
+      prepare_join(join_by, names(ds_list)) %>%
       reduce(full_join, by = join_by)
 }
 
-
-
 #' Internal functions to get variables from a dataset list
 #'
 #' This function is used with `build_from_derived` to build a dataset of columns
@@ -140,22 +168,84 @@ build_from_derived <- function(metacore, ds_list, dataset_name = NULL,
 #'
 #' @return datasets
 #' @noRd
-get_variables <- function(x, ds_list, keep) {
+get_variables <- function(x, ds_list, keep, derivations) {
    ds_name <- unique(x$ds)
    data <- ds_list[[ds_name]]
    rename_vec <- set_names(x$col_name, x$variable)
-   if (keep) {
+   if (keep == "TRUE") {
+      # Don't drop predecessor columns
       out <- data %>%
          select(x$col_name) %>%
          mutate(across(all_of(rename_vec)))
-   } else {
+   } else if (keep == "FALSE") {
+      # Drop predecessor columns
       out <- data %>%
          select(x$col_name) %>%
-         rename(all_of(rename_vec))
+         mutate(across(all_of(rename_vec))) %>%
+         select(x$variable)
+   } else if (keep == "ALL") {
+      # Keep all cols from original datasets
+      out <- data %>%
+         mutate(across(all_of(rename_vec)))
+   } else if (keep == "PREREQUISITE") {
+      # Keep all columns required for future derivations
+      # Find all "XX.XXXXX"
+      future_derivations <- derivations %>%
+         select(derivation) %>%
+         filter(!str_detect(derivation,"^[A-Z]+\\.[A-Z0-9a-z]+$"))
+
+      prereq_vector <- str_match_all(future_derivations$derivation, "([A-Z]+)\\.([A-Z0-9a-z]+)")
+
+      # Bind into matrix + remove dups
+      prereq_matrix <- do.call(rbind,prereq_vector) %>%
+         unique()
+
+      # Subset to those present in current dataset
+      prereq_cols <- subset(prereq_matrix, tolower(prereq_matrix[,2]) == tolower(ds_name))[,3]
+
+      out <- data %>%
+         select(c(x$col_name, all_of(prereq_cols))) %>%
+         mutate(across(all_of(rename_vec))) %>%
+         select(c(x$variable, all_of(prereq_cols)))
    }
    out
 }
 
+#' Internal function to remove duplicated non-key variables prior to join
+#'
+#' This function is used with `build_from_derived` to drop columns that would
+#' cause a conflict on joining datasets, prioritising keeping columns in
+#' datasets earlier on in ds_list
+#'
+#' @param x List of datasets with all columns added
+#' @param keys List of key values to join on
+#'
+#' @return datasets
+#' @noRd
+prepare_join <- function(x, keys, ds_names) {
+   out <- list(x[[1]])
+
+   if (length(x) > 1){
+      for (i in 2:length(x)){
+         # Drop non-key cols present in each previous dataset in order
+         drop_cols <- c()
+
+         for (j in 1:(i-1)){
+            conflicting_cols <- keep(names(x[[j]]), function(col) !(col %in% keys)) %>%
+               intersect(colnames(x[[i]]))
+            drop_cols <- c(drop_cols, conflicting_cols)
+
+            if(length(conflicting_cols) > 0){
+               message(paste0("Dropping column(s) from ", ds_names[[i]]," due to conflict with ",ds_names[[j]],": ", conflicting_cols,"."))
+            }
+         }
+
+         out[[i]] <- x[[i]] %>%
+            select(-any_of(drop_cols))
+      }
+   }
+   out
+}
 
 #' Drop Unspecified Variables
 #'

diff --git a/tests/testthat/test-build.R b/tests/testthat/test-build.R
@@ -28,7 +28,7 @@ test_that("build_from_derived", {
     pull(derivation) %>%
     str_remove("^DM\\.") %>%
     unique() %>%
-    ifelse(. == "ARM", "TRT01P", .) %>%
+    c("TRT01P") %>%
     sort()
   build_from_derived(spec, ds_list,
     predecessor_only = FALSE,
@@ -46,13 +46,16 @@ test_that("build_from_derived", {
     unique() %>%
     c(., "TRT01P") %>%
     sort()
-  build_from_derived(spec, ds_list,
-    predecessor_only = FALSE,
-    keep = TRUE
-  ) %>%
-    names() %>%
-    sort() %>%
-    expect_equal(man_vars)
+
+  expect_warning(
+     build_from_derived(spec, ds_list,
+       predecessor_only = FALSE,
+       keep = TRUE
+     ) %>%
+       names() %>%
+       sort() %>%
+       expect_equal(man_vars)
+  )
 
   # Pulling through from more than one dataset
   spec2 <- metacore %>% select_dataset("ADAE")
@@ -108,6 +111,50 @@ test_that("build_from_derived", {
                                     keep = FALSE
   ))
 
+  # Pulling through all columns from original dataset
+  adae_full <- build_from_derived(spec2,
+                                  ds_list = list("AE" = safetyData::sdtm_ae,
+                                                 "ADSL" = safetyData::adam_adsl),
+                                  predecessor_only = FALSE,
+                                  keep = "ALL"
+  )
+
+  full_adsl_part <- safetyData::adam_adsl %>%
+     mutate(TRTA = TRT01A, TRTAN = TRT01AN)
+
+  adae_all_man <- full_join(full_adsl_part, safetyData::sdtm_ae, by = c("STUDYID", "USUBJID"), multiple = "all")
+
+  expect_equal(adae_full,adae_all_man)
+
+  # Pulling through columns required for future derivations
+  spec3 <- metacore %>% select_dataset("ADVS")
+
+  advs_prereq <- build_from_derived(spec3,
+                                  ds_list = list("VS" = safetyData::sdtm_vs,
+                                                 "ADSL" = safetyData::adam_adsl),
+                                  predecessor_only = FALSE,
+                                  keep = "PREREQUISITE"
+  )
+
+  advs_auto <- build_from_derived(spec3,
+                                    ds_list = list("VS" = safetyData::sdtm_vs,
+                                                   "ADSL" = safetyData::adam_adsl),
+                                    predecessor_only = FALSE,
+                                    keep = "PREREQUISITE"
+  )
+
+
+  advs_all <- build_from_derived(spec3,
+                                 ds_list = list("VS" = safetyData::sdtm_vs,
+                                                "ADSL" = safetyData::adam_adsl),
+                                 predecessor_only = FALSE,
+                                 keep = "ALL"
+  )
+
+  advs_prereq_man <- advs_all %>%
+     select(c(names(advs_auto), VSDTC, VSSTRESN))
+
+  expect_equal(advs_prereq, advs_prereq_man)
 
 })