Commit

Merge branch 'dev'
Ludvig committed Jan 3, 2023
2 parents b8d24b7 + 88518c9 commit 07c7140
Showing 15 changed files with 193 additions and 103 deletions.
6 changes: 3 additions & 3 deletions DESCRIPTION
@@ -1,6 +1,6 @@
Package: cvms
Title: Cross-Validation for Model Selection
Version: 1.3.7.9000
Version: 1.3.8
Authors@R:
c(person(given = "Ludvig Renbo",
family = "Olsen",
@@ -36,7 +36,7 @@ Imports:
data.table (>= 1.12),
dplyr (>= 0.8.5),
ggplot2,
groupdata2 (>= 2.0.1),
groupdata2 (>= 2.0.2),
lifecycle,
lme4 (>= 1.1-23),
MuMIn (>= 1.43.17),
@@ -73,4 +73,4 @@ RdMacros:
lifecycle
Encoding: UTF-8
LazyData: true
RoxygenNote: 7.2.2
RoxygenNote: 7.2.3
6 changes: 5 additions & 1 deletion NEWS.md
@@ -1,5 +1,9 @@

# cvms 1.3.7.9000
# cvms 1.3.8

* In relevant contexts: Informs the user *once* that the `positive` argument in `evaluate()` and `cross_validate*()` does not affect the interpretation of the probabilities (see the sketch after this list). I had forgotten about this in a project myself, so it seems useful to remind us all about it :-)

* Fixes usage of the `"all"` name in `set_metrics()` after the `purrr v1.0.0` update.
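
A minimal sketch of what the new message warns about. The data frame and its values are made up; only `evaluate()` and its `target_col`, `prediction_cols`, `type`, and `positive` arguments come from the package.

library(cvms)

d <- data.frame(
  target = c("cat", "dog", "dog", "cat"),
  prob   = c(0.2, 0.8, 0.6, 0.3)  # probabilities of "dog", the second class alphabetically
)

# `positive = "cat"` only changes which class the confusion matrix metrics
# (Sensitivity, Specificity, F1, ...) treat as positive; `prob` is still
# interpreted as the probability of "dog", which is what the message points out.
evaluate(
  d,
  target_col = "target",
  prediction_cols = "prob",
  type = "binomial",
  positive = "cat"
)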

# cvms 1.3.7

26 changes: 13 additions & 13 deletions R/cross_validate.R
@@ -62,28 +62,28 @@
#'
#' Used when calculating confusion matrix metrics and creating \code{ROC} curves.
#'
#' The \code{Positive Class} column in the output can be used to verify this setting.
#' The \code{Process} column in the output can be used to verify this setting.
#'
#' N.B. Only affects evaluation metrics, not the model training or returned predictions.
#'
#' N.B. \strong{Binomial models only}.
#' @param metrics \code{list} for enabling/disabling metrics.
#'
#' E.g. \code{list("RMSE" = FALSE)} would remove \code{RMSE} from the results,
#' and \code{list("Accuracy" = TRUE)} would add the regular \code{Accuracy} metric
#' to the classification results.
#' Default values (\code{TRUE}/\code{FALSE}) will be used for the remaining available metrics.
#' E.g. \code{list("RMSE" = FALSE)} would remove \code{RMSE} from the results,
#' and \code{list("Accuracy" = TRUE)} would add the regular \code{Accuracy} metric
#' to the classification results.
#' Default values (\code{TRUE}/\code{FALSE}) will be used for the remaining available metrics.
#'
#' You can enable/disable all metrics at once by including
#' \code{"all" = TRUE/FALSE} in the \code{list}. This is done prior to enabling/disabling
#' individual metrics, why \code{list("all" = FALSE, "RMSE" = TRUE)}
#' would return only the \code{RMSE} metric.
#' You can enable/disable all metrics at once by including
#' \code{"all" = TRUE/FALSE} in the \code{list}. This is done prior to enabling/disabling
#' individual metrics, why \code{list("all" = FALSE, "RMSE" = TRUE)}
#' would return only the \code{RMSE} metric.
#'
#' The \code{list} can be created with
#' \code{\link[cvms:gaussian_metrics]{gaussian_metrics()}} or
#' \code{\link[cvms:binomial_metrics]{binomial_metrics()}}.
#' The \code{list} can be created with
#' \code{\link[cvms:gaussian_metrics]{gaussian_metrics()}} or
#' \code{\link[cvms:binomial_metrics]{binomial_metrics()}}.
#'
#' Also accepts the string \code{"all"}.
#' Also accepts the string \code{"all"}.
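
For example (an illustrative sketch; the `data` object and the formula are placeholders, not part of this diff), keeping only `RMSE` in a gaussian cross-validation:

cross_validate(
  data,
  formulas = "score ~ diagnosis",
  family = "gaussian",
  metrics = list("all" = FALSE, "RMSE" = TRUE)
)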
#' @param preprocessing Name of preprocessing to apply.
#'
#' Available preprocessings are:
7 changes: 6 additions & 1 deletion R/cross_validate_fn.R
@@ -55,10 +55,15 @@
#'
#' \subsection{Binomial}{
#' \code{vector} or one-column \code{matrix} / \code{data.frame} with probabilities (0-1)
#' of the second class, alphabetically.
#' \strong{of the second class, alphabetically}.
#' E.g.:
#'
#' \code{c(0.3, 0.5, 0.1, 0.5)}
#'
#' N.B. When unsure whether a model type produces probabilities based on
#' the alphabetical order of your classes, using 0 and 1 as the classes in the
#' dependent variable instead of the class names should increase the chance of
#' getting probabilities for the right class.
#' }
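
For context, a sketch of a binomial `predict_fn` that meets this requirement. The argument names follow the `cross_validate_fn()` documentation as far as I recall; the `glm()`-style `predict()` call is one common case, not something prescribed by this diff.

predict_fn <- function(test_data, model, formula, hyperparameters, train_data) {
  # For a glm() model, predict(type = "response") returns the probability of the
  # second factor level, which matches "second class, alphabetically" when the
  # levels are alphabetically ordered (or are simply 0 and 1).
  predict(model, newdata = test_data, type = "response")
}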
#'
#' \subsection{Gaussian}{
27 changes: 18 additions & 9 deletions R/evaluate.R
@@ -788,15 +788,24 @@ run_evaluate <- function(data,
"t be either numeric or character."))
}

if (is.numeric(data[[prediction_cols]]) && (
max(data[[prediction_cols]]) > 1 ||
min(data[[prediction_cols]]) < 0)) {
assert_collection$push(
paste0(
"When 'type' is 'binomial' and 'data[[prediction_cols]]' ",
"is numeric, the values in 'data[[prediction_cols]]' must be b",
"etween 0 and 1."
))
if (is.numeric(data[[prediction_cols]])){
if (max(data[[prediction_cols]]) > 1 ||
min(data[[prediction_cols]]) < 0) {
assert_collection$push(
paste0(
"When 'type' is 'binomial' and 'data[[prediction_cols]]' ",
"is numeric, the values in 'data[[prediction_cols]]' must be b",
"etween 0 and 1."
))
}

# One may believe that setting the `positive` argument to the name
# of a class should mean that probabilities > `cutoff` would be
# considered that class, but this is not the case, so we
# make the user aware of this (once)
if (is.character(positive) && positive != sort(unique(as.character(data[[target_col]])))[[2]]){
inform_about_positive_no_effect_on_probs(positive=positive)
}
}
checkmate::reportAssertions(assert_collection)

29 changes: 20 additions & 9 deletions R/evaluate_predictions_binomial.R
@@ -66,15 +66,6 @@ evaluate_predictions_binomial <- function(data,
stop("The target column must maximally contain 2 levels.")
}

# Create a column with the predicted class based on the chosen cutoff
# If it wasn't passed by parent function
if (is.null(predicted_class_col)) {
predicted_class_col <- create_tmp_name(data, "predicted_class")
data[[predicted_class_col]] <- ifelse(data[[prediction_col]] < model_specifics[["cutoff"]],
cat_levels[[1]], cat_levels[[2]]
)
}

positive <- model_specifics[["positive"]]
if (is.numeric(positive)) {
positive <- cat_levels[positive]
@@ -85,6 +76,26 @@
))
}

# Create a column with the predicted class based on the chosen cutoff
# If it wasn't passed by parent function
if (is.null(predicted_class_col)) {

# One may believe that setting the `positive` argument to the name
# of a class should mean that probabilities > `cutoff` would be
# considered that class, but this is not the case, so we
# make the user aware of this (once)
if (is.character(model_specifics[["positive"]]) && positive != cat_levels[[2]]){
inform_about_positive_no_effect_on_probs(positive=positive)
}

predicted_class_col <- create_tmp_name(data, "predicted_class")
data[[predicted_class_col]] <- ifelse(data[[prediction_col]] < model_specifics[["cutoff"]],
cat_levels[[1]], cat_levels[[2]]
)
}



# Nest predictions and targets
# Will be NA if any model_was_null is TRUE and
# include_predictions is TRUE
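
As an aside (not part of the diff), the cutoff-to-class mapping used in the moved block above works like this; the class names, probabilities, and the 0.5 cutoff are made up for illustration:

cat_levels <- c("cat", "dog")    # sorted class levels; "dog" is the second class
probs <- c(0.3, 0.5, 0.1, 0.7)   # probabilities of the second class, "dog"
ifelse(probs < 0.5, cat_levels[[1]], cat_levels[[2]])
#> [1] "cat" "dog" "cat" "dog"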
38 changes: 38 additions & 0 deletions R/helpers.R
@@ -676,8 +676,46 @@ create_message <- function(m, caller, formula = NULL, fold_col = NULL, fold = NU
}


# From tidyselect:
# https://github.com/r-lib/tidyselect/blob/2fab83639982d37fd94914210f771ab9cbd36b4b/R/utils.R#L281
# inform_once(c("Main message", list("bullet1", "bullet2")), id="some ID")
inform_env <- rlang::env()
inform_once <- function(msg, id = msg) {
stopifnot(rlang::is_string(id))

if (rlang::env_has(inform_env, id)) {
return(invisible(NULL))
}
inform_env[[id]] <- TRUE

issue <- msg[[1]]
bullets <- msg[-1]

msg <- issue
if (length(bullets)) {
bullets <- rlang::format_error_bullets(bullets)
msg <- paste_line(msg, bullets)
}

rlang::inform(paste_line(
msg, "< This message is displayed once per session. >"
))
}

# From tidyselect
paste_line <- function (...) {
paste(rlang::chr(...), collapse = "\n")
}


inform_about_positive_no_effect_on_probs <- function(positive){
inform_once(c(paste0("cvms::evaluate(type='binomial', positive='", positive, "', ):"), paste0(
"Please be aware that setting the `positive` argument ",
"does not change what the probabilities are of ",
"(second class alphabetically), only the confusion matrix-based metrics."
)), id="evaluate: The `positive` argument does not affect probabilities.")
}
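
A quick illustration (not part of the diff) of the once-per-session behaviour of `inform_once()`; the message text and `id` are made up:

# First call emits the message, followed by the
# "< This message is displayed once per session. >" note.
inform_once(c("Example heads-up", "this bullet adds detail"),
            id = "helpers demo: example heads-up")

# A second call with the same `id` in the same session returns invisibly and is silent.
inform_once(c("Example heads-up", "this bullet adds detail"),
            id = "helpers demo: example heads-up")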

# __________________ #< 71c73c7cedb289ef6c3dd17503736847 ># __________________
# Convert to tibble ####

2 changes: 1 addition & 1 deletion R/set_metrics.R
@@ -164,7 +164,7 @@ set_metrics <- function(family, metrics_list = NULL, include_model_object_metric
if (length(metrics_list) == 1) {
metrics_list <- list()
} else {
metrics_list <- metrics_list %>% purrr::list_modify("all" = NULL)
metrics_list <- metrics_list[names(metrics_list) != "all"]
}
}
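
For reference, the base-R subsetting used above drops the `"all"` entry by name without relying on `purrr::list_modify()` (metric names made up for the illustration):

metrics_list <- list("all" = FALSE, "RMSE" = TRUE)
metrics_list[names(metrics_list) != "all"]
#> $RMSE
#> [1] TRUE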

26 changes: 13 additions & 13 deletions man/cross_validate.Rd

Generated file; diff not rendered.

9 changes: 7 additions & 2 deletions man/cross_validate_fn.Rd

Generated file; diff not rendered.

26 changes: 13 additions & 13 deletions man/validate.Rd

Generated file; diff not rendered.

9 changes: 7 additions & 2 deletions man/validate_fn.Rd

Generated file; diff not rendered.
