Merge pull request #114 from IQSS/dev

Now at CRAN 0.3.10! https://CRAN.R-project.org/package=dataverse
IQSS · Jan 13, 2022 · d496001 · d496001
2 parents 4775a92 + f696f74
commit d496001
Show file tree

Hide file tree

Showing 15 changed files with 280 additions and 162 deletions.
diff --git a/.Rbuildignore b/.Rbuildignore
@@ -23,3 +23,4 @@ man-roxygen/*
 ^CRAN-RELEASE$
 ^\.github$
 rhub-checks
+/Untitled.+\.R$
diff --git a/.github/workflows/R-CMD-check-daily.yaml b/.github/workflows/R-CMD-check-daily.yaml
@@ -2,7 +2,7 @@
 # https://community.rstudio.com/new-topic?category=Package%20development&tags=github-actions
 on:
   schedule:
-    - cron: "20 3 * * *" # Run every morning at 3:20am UTC 
+    - cron: "20 3 * * *" # Run every morning at 3:20am UTC
     # - cron: "7 1 * * *" # Run every morning at 1:07am UTC (~8pm central)
 
 name: R-CMD-check-daily
@@ -14,7 +14,7 @@ jobs:
       GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
     steps:
       - uses: actions/checkout@v2
-      - uses: r-lib/actions/setup-r@v1
+      - uses: r-lib/actions/setup-r@v2
       - uses: r-lib/actions/setup-pandoc@master
 
       - name: Query dependencies

diff --git a/NEWS.md b/NEWS.md
@@ -4,6 +4,7 @@
 
 * Add progress bar for all downloads (#108)
 * Minor documentation improvements (#64, #107)
+* Faster method for detecting ingest (#113) and more robust handling of ingested files that lack a metadata file due to errors (#80)
 * No longer relies on `foreign` (#34)
 
 # CHANGES in dataverse 0.3.9

diff --git a/R/get_dataframe.R b/R/get_dataframe.R
@@ -19,8 +19,9 @@
 #'  file, then `.f` should be `readRDS` or `readr::read_rds`. It can be a custom
 #'  function defined by the user. See examples for details.
 #'
-#' @param original A logical, defaulting to `TRUE`. Whether to read the ingested,
-#' archival version of the datafile if one exists. The archival versions are tab-delimited
+#' @param original A logical, whether to read the original file as uploaded
+#' instead of the ingested, archival version of the datafile if one exists. If `TRUE`,
+#' users should supply a function to use to read in the original. The archival versions are tab-delimited
 #' `.tab` files so if `original = FALSE`, `.f` is set to `readr::read_tsv`.
 #'
 #' @inheritDotParams get_file
@@ -90,8 +91,8 @@
 #' writeBin(as_binary, path(temp, "county.RData"))
 #' load(path(temp, "county.RData"))
 #'
-#' If you are certain each RData contains only one object, one could define a custom
-#' custom function used in https://stackoverflow.com/a/34926943
+#' # If you are certain each RData contains only one object, you could define a
+#' # custom function like the one in https://stackoverflow.com/a/34926943
 #' load_object <- function(file) {
 #'   tmp <- new.env()
 #'   load(file = file, envir = tmp)
@@ -132,6 +133,7 @@ get_dataframe_by_id <- function(
   # if not ingested, then whether to take the original is not relevant.
   ingested <- is_ingested(fileid, ...)
 
+
   if (isFALSE(ingested)) {
     original <- NA
   }

diff --git a/R/get_file_by_id.R b/R/get_file_by_id.R
@@ -5,7 +5,9 @@
 #' no ingested version, is set to NA. Note in `get_dataframe_*`,
 #' `original` is set to FALSE by default. Either can be changed.
 #' @param fileid A numeric ID internally used for `get_file_by_id`. Can be a vector for multiple files.
-#' @param progress Whether to show a progress bar of the download. Defaults to `FALSE`.
+#' @param progress Whether to show a progress bar of the download.
+#'   If not specified, will be set to `TRUE` for a file larger than 100MB. To force
+#'   a particular value, set it explicitly to `FALSE` or `TRUE`.
 #'
 #' @export
 get_file_by_id <- function(
@@ -14,7 +16,7 @@ get_file_by_id <- function(
   format          = c("original", "bundle"),
   vars            = NULL,
   original        = TRUE,
-  progress        = FALSE,
+  progress        = NULL,
   key             = Sys.getenv("DATAVERSE_KEY"),
   server          = Sys.getenv("DATAVERSE_SERVER"),
   ...
@@ -44,25 +46,37 @@ get_file_by_id <- function(
     }
 
     # ping get_file_metadata to see if file is ingested
-    is_ingested <- is_ingested(fileid, server = server)
+    ingested <- is_ingested(fileid, server = server, key = key)
+
+    # if progress = NULL, determine progress by size
+    if (is.null(progress)) {
+      bytesize <- get_filesize(fileid, server = server, key = key)
+      if (isTRUE(bytesize > 1e8)) {
+        progress <- TRUE
+      } else {
+        progress <- FALSE
+      }
+    }
 
     # files that are not ingested have no archival version, so `original` is set to NA
-    if (isFALSE(is_ingested))
+    if (isFALSE(ingested))
       original <- NA
 
     # create query -----
     query <- list()
+
+    # variables
     if (!is.null(vars))
       query$vars <- paste0(vars, collapse = ",")
 
     # format only matters in ingested datasets,
-    # For non-ingested files (rds/docx), we need to NOT specify a format
+    # For non-ingested files (e.g. rds/docx), we need to NOT specify a format
     # also for bundle, only change url
-    if (is_ingested & format != "bundle")
+    if (ingested & format != "bundle")
       query$format <- match.arg(format)
 
     # if the original is not desired, we need to NOT specify a format
-    if (is_ingested & (isFALSE(original) || is.na(original) || is.null(original)))
+    if (ingested & (isFALSE(original) || is.na(original) || is.null(original)))
       query$format <- NULL
 
     # part of URL depending on DOI, bundle, or file