-
Notifications
You must be signed in to change notification settings - Fork 4
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Variable inclusion control #75
base: main
Are you sure you want to change the base?
Changes from 12 commits
20d2466
00d9b44
47a3e6f
a4fa1aa
d6db327
36eac1e
be2093e
9292ec0
3d90692
d644880
51a39a3
592533e
39b26e0
806d915
0caaf0e
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -16,16 +16,22 @@ | |
#' @param predecessor_only By default `TRUE`, so only variables with the origin | ||
#' of 'Predecessor' will be used. If `FALSE` any derivation matching the | ||
#' dataset.variable will be used. | ||
#' @param keep Boolean to determine if the original columns should be kept. By | ||
#' default `FALSE`, so only the ADaM columns are kept. If `TRUE` the resulting | ||
#' dataset will have all the ADaM columns as well as any SDTM column that were | ||
#' renamed in the ADaM (i.e `ARM` and `TRT01P` will be in the resulting | ||
#' dataset) | ||
#' @param keep String to determine which columns from the original datasets | ||
#' should be kept | ||
#' - "FALSE" (default): only columns that are also present in the ADaM | ||
#' specification are kept in the output. | ||
#' - "ALL": all original columns are carried through to the | ||
#' ADaM, including those that have been renamed. | ||
#' - "PREREQUISITE": columns are kept if they are required for future | ||
#' derivations in the specification. For example, if | ||
#' a derivation references VSSTDTC despite this not | ||
#' being present in the ADaM specification, the column | ||
#' will be kept. | ||
#' | ||
#' @return dataset | ||
#' @export | ||
#' @importFrom stringr str_to_lower str_detect str_extract str_to_upper | ||
#' str_split | ||
#' str_split str_match_all | ||
#' @importFrom dplyr filter pull mutate group_by group_split inner_join select | ||
#' full_join bind_rows | ||
#' @importFrom tidyr unnest | ||
|
@@ -40,8 +46,30 @@ | |
#' spec <- metacore %>% select_dataset("ADSL") | ||
#' ds_list <- list(DM = read_xpt(metatools_example("dm.xpt"))) | ||
#' build_from_derived(spec, ds_list, predecessor_only = FALSE) | ||
#' | ||
#' # Building an ADaM (ADVS) from multiple input datasets, keeping columns | ||
#' # needed for future transformations | ||
#' library(metacore) | ||
#' library(haven) | ||
#' library(magrittr) | ||
#' library(safetyData) | ||
#' load(metacore_example("pilot_ADaM.rda")) | ||
#' spec <- metacore %>% select_dataset("ADVS") | ||
#' ds_list <- list("VS" = safetyData::sdtm_vs,"ADSL" = safetyData::adam_adsl) | ||
#' build_from_derived(spec, | ||
#' ds_list, | ||
#' predecessor_only = FALSE, | ||
#' keep = "PREREQUISITE" | ||
#' ) | ||
|
||
build_from_derived <- function(metacore, ds_list, dataset_name = NULL, | ||
predecessor_only = TRUE, keep = FALSE) { | ||
# Deprecate KEEP = TRUE | ||
keep <- match.arg(as.character(keep), c("TRUE", "FALSE", "ALL", "PREREQUISITE")) | ||
if (keep == "TRUE"){ | ||
warning("Setting 'keep' = TRUE has been superseded, and will be unavailable in future releases. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can you switch this to a cli_alert_warning. At some point I am going to improve the error messaging in this package and change everything over to cli There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. have done so, and updated tests/NAMESPACE/DESCRIPTION to import CLI |
||
Please consider setting 'keep' equal to 'ALL' or 'PREREQUISITE'.") | ||
} | ||
metacore <- make_lone_dataset(metacore, dataset_name) | ||
derirvations <- metacore$derivations %>% | ||
mutate(derivation = trimws(derivation)) | ||
|
@@ -60,6 +88,7 @@ build_from_derived <- function(metacore, ds_list, dataset_name = NULL, | |
|
||
vars_to_pull_through <- derirvations %>% | ||
filter(str_detect(derivation, "^\\w*\\.[a-zA-Z0-9]*$")) | ||
|
||
# To lower so it is flexible about how people name their ds list | ||
vars_w_ds <- vars_to_pull_through %>% | ||
mutate(ds = str_extract(derivation, "^\\w*(?=\\.)") %>% | ||
|
@@ -123,12 +152,11 @@ build_from_derived <- function(metacore, ds_list, dataset_name = NULL, | |
bind_rows(additional_vals) %>% | ||
group_by(ds) %>% | ||
group_split() %>% | ||
map(get_variables, ds_list, keep) %>% | ||
map(get_variables, ds_list, keep, derirvations) %>% | ||
prepare_join(join_by, names(ds_list)) %>% | ||
reduce(full_join, by = join_by) | ||
} | ||
|
||
|
||
|
||
#' Internal functions to get variables from a dataset list | ||
#' | ||
#' This function is used with `build_from_derived` to build a dataset of columns | ||
|
@@ -140,22 +168,84 @@ build_from_derived <- function(metacore, ds_list, dataset_name = NULL, | |
#' | ||
#' @return datasets | ||
#' @noRd | ||
get_variables <- function(x, ds_list, keep) { | ||
get_variables <- function(x, ds_list, keep, derivations) { | ||
ds_name <- unique(x$ds) | ||
data <- ds_list[[ds_name]] | ||
rename_vec <- set_names(x$col_name, x$variable) | ||
if (keep) { | ||
if (keep == "TRUE") { | ||
# Don't drop predecessor columns | ||
out <- data %>% | ||
select(x$col_name) %>% | ||
mutate(across(all_of(rename_vec))) | ||
} else { | ||
} else if (keep == "FALSE") { | ||
# Drop predecessor columns | ||
out <- data %>% | ||
select(x$col_name) %>% | ||
rename(all_of(rename_vec)) | ||
mutate(across(all_of(rename_vec))) %>% | ||
select(x$variable) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is the change to fix the issue with one predecessor to multiple columns - not sure if this would break anything, figure you probably understand how people might use it better, |
||
} else if (keep == "ALL") { | ||
# Keep all cols from original datasets | ||
out <- data %>% | ||
mutate(across(all_of(rename_vec))) | ||
} else if (keep == "PREREQUISITE") { | ||
# Keep all columns required for future derivations | ||
# Find all "XX.XXXXX" | ||
future_derivations <- derivations %>% | ||
select(derivation) %>% | ||
filter(!str_detect(derivation,"^[A-Z]+\\.[A-Z0-9a-z]+$")) | ||
MattBearham marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
prereq_vector <- str_match_all(future_derivations$derivation, "([A-Z]+)\\.([A-Z0-9a-z]+)") | ||
MattBearham marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
# Bind into matrix + remove dups | ||
prereq_matrix <- do.call(rbind,prereq_vector) %>% | ||
unique() | ||
|
||
# Subset to those present in current dataset | ||
prereq_cols <- subset(prereq_matrix, tolower(prereq_matrix[,2]) == tolower(ds_name))[,3] | ||
|
||
out <- data %>% | ||
select(c(x$col_name, all_of(prereq_cols))) %>% | ||
mutate(across(all_of(rename_vec))) %>% | ||
select(c(x$variable, all_of(prereq_cols))) | ||
} | ||
out | ||
} | ||
|
||
#' Internal function to remove duplicated non-key variables prior to join | ||
#' | ||
#' This function is used with `build_from_derived` to drop columns that would | ||
#' cause a conflict on joining datasets, prioritising keeping columns in | ||
#' datasets earlier on in ds_list | ||
#' | ||
#' @param x List of datasets with all columns added | ||
#' @param keys List of key values to join on | ||
#' | ||
#' @return datasets | ||
#' @noRd | ||
prepare_join <- function(x, keys, ds_names) { | ||
out <- list(x[[1]]) | ||
|
||
if (length(x) > 1){ | ||
for (i in 2:length(x)){ | ||
# Drop non-key cols present in each previous dataset in order | ||
drop_cols <- c() | ||
|
||
for (j in 1:(i-1)){ | ||
conflicting_cols <- keep(names(x[[j]]), function(col) !(col %in% keys)) %>% | ||
intersect(colnames(x[[i]])) | ||
drop_cols <- c(drop_cols, conflicting_cols) | ||
|
||
if(length(conflicting_cols) > 0){ | ||
message(paste0("Dropping column(s) from ", ds_names[[i]]," due to conflict with ",ds_names[[j]],": ", conflicting_cols,".")) | ||
MattBearham marked this conversation as resolved.
Show resolved
Hide resolved
|
||
} | ||
} | ||
|
||
out[[i]] <- x[[i]] %>% | ||
select(-any_of(drop_cols)) | ||
} | ||
} | ||
out | ||
} | ||
|
||
#' Drop Unspecified Variables | ||
#' | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I am not sure I understand what this means. Because it seems to be both saying that if it is in the specification it will be kept even if it isn't the specifications... Also can you make sure to add in what the pattern is we use. So something like "Variables will be retained if they are either in the derivation column of the metacore object in the form of DATASET.VARIABLE. So in the case of "BMI = ADSL.HEIGHTBL/WEIGHTBL^2" both HEIGHTBL and WEIGHTBL will be retrained delisted not being a predecessor (this will need to be edited cause I can don't think that is the right bmi calculation)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Have updated to be more specific and include examples of predecessors/prerequisites with definitions. Let me know if you think it's clearer!