diff --git a/DESCRIPTION b/DESCRIPTION index 0470a61..93a2ae7 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: PThelper Type: Package Title: The Systematic Cleaning and Processing of Purchase Task Data: An R Statistical Package -Version: 0.1.2 +Version: 0.1.5 Authors@R: c(person("Kyla","Belisario", role = c("aut", "cre", "trl"), email = "kyla_belisario@protonmail.com", comment = c(ORCID = "0000-0001-5403-9531")), person("Ben","Goodman", role = c("aut", "trl")), person("Meenu","Minhas", role = c("aut"), comment = c(ORCID = "0000-0001-6741-3940")), diff --git a/R/pt_qc.R b/R/pt_qc.R index 96305ed..390f876 100644 --- a/R/pt_qc.R +++ b/R/pt_qc.R @@ -1,22 +1,30 @@ #' PT QC #' -#' This function helps users to conduct quality control on purchase task data by -#' using the 3-Criterion Algorithm by Stein et al. (2015) to remove non-systematic data: -#' Specifically, this function identifies and removes IDs with: i) trend violations; -#' ii) excessive bounce ratios; and iii) excessive reversals in responses. +#' This function helps users to conduct quality control on purchase task data to remove non-systematic data. Specifically, this function +#' identifies and removes IDs with: i) trend violations; ii) excessive bounce ratios; and iii) excessive reversals in responses. Quality +#' control options follow the proposed 3-criterion method by Stein et al. (2015), but also allow for customization, as well as use of +#' another option for bounce criterion used at PBCAR. #' #' Stein, J. S., Koffarnus, M. N., Snider, S. E., Quisenberry, A. J., & Bickel, W. K. (2015). #' Identification and management of nonsystematic purchase task data: Toward best practice. #' Experimental and clinical psychopharmacology, 23(5), 377. #' -#' @param pt A data frame consisting of the `id_var` and purchase task variables. +#' @param pt A list consisting of two data frames: "data" which consists of the `id_var` and purchase task variables, and the "qc_data" which provides details on +#' the results of the quality control for all IDs. #' @param id_var The name of the unique identifier (ID) as identified in the data frame. -#' @param bounce_val Expressed as a proportion, the bounce value is used as a -#' threshold to identify excessive inconsistencies in responses. The default bounce -#' value is 0.1. IDs exceeding this bounce value are removed. -#' @param rev_n The number of acceptable reversals from zero. The default number is 0, -#' which removes IDs with 1 or more reversals from zero. -#' @param cons_n The number of consecutive zeroes to signify a reversal from zero using +#' @param delta_q The log10-unit reduction in consumption from the first to last price. The default is set to 0.025 (suggested by Stein et al., 2015), +#' with values lower than 0.025 deemed non-systematic. +#' @param bounce_type The type of bounce criterion applied, one of c("initial","p2p"). The default is "initial", which follows the criterion put forth by +#' Stein et al. (2015). This method defines what constistutes a `jump`, with all subsequent prices in the purchase task being compared to the consumption +#' at the first price. The "p2p" option is a proposed alternative, which considers any increases in consumption from one price to the next to be a `jump` +#' in consumption. Both methods make use of the `bounce_val` which will only exclude participants above a set threshold of jumps. +#' @param jump Expressed as a proportion, the jump value is the percent increase in consumption at first price used to define an excessive increase in +#' consumption. The default is 0.25 (25% as suggested by Stein et al., 2015), meaning that any consumption 25% higher than consumption at first price +#' would be considered an excessive jump in consumption. +#' @param bounce_val Expressed as a proportion, the bounce value is used as a threshold to identify excessive inconsistencies in responses. The +#' default bounce value is 0.1. IDs exceeding this bounce value are removed. +#' @param rev_n The number of acceptable reversals from zero, one of c(0,1). The default number is 0, meaning no reversals from zero are allowed. +#' @param cons_0 The number of consecutive zeroes to signify a reversal from zero, one of c(1,2). The default is 1. Previously named `cons_n`. #' the {beezdemand} package. The default number of consecutive zeroes is 2. #' @examples #' \dontrun{ @@ -28,59 +36,130 @@ #' } #' @export -pt_qc <- function(pt, id_var, bounce_val = 0.1, rev_n = 0, cons_n = 2){ +pt_qc <- function(pt, id_var, delta_q = 0.025, bounce_type = "initial", jump = 0.25, bounce_val = 0.1, rev_n = 0, cons_0 = 1){ - # WARNING: NA values should have been changed to 0 as outlined in the `pt_prep()` function + ### WARNING: NA values should have been changed to 0 as outlined in the `pt_prep()` function if(any(is.na(pt))) stop("IDs with missing values") pt_names <- names(pt) prices <- pt_names[pt_names!=id_var] names(pt)[names(pt) == id_var] <- "id" - # IDENTIFY & REMOVE IDs with a trend violation + ##### ----- IDENTIFY & REMOVE IDs with a trend violation + remove.id.trend = {} - for (id_num in pt$id){ - if ((pt[pt$id == id_num,prices[1]]>0) & - (pt[pt$id == id_num,prices[1]] <= pt[pt$id == id_num,prices[length(prices)]]) ){ - pt <- pt[!pt[,"id"] %in% c(id_num),] - remove.id.trend <- append(remove.id.trend,id_num) - } - } - # IDENTIFY & REMOVE IDs + pt$dq_1 <- as.numeric(pt[,prices[1]])+0.001 ### quantity at price 1 + pt$dq_n <- as.numeric(pt[,prices[length(prices)]])+0.001 ### quantity at price n + pt$dp_1 <- as.numeric(prices[1])+0.001 ### price 1 (intensity) + pt$dp_n <- as.numeric(prices[length(prices)])+0.001 ## price n + + ### FORMULA: FORMULA: deltaQ = (log10(quantity at price 1) - log10(quantity at price n)) / (log10(price n) - log10(price 1)) + ### allow those with zero consumption at price 1 to pass (by making delta_q the value considered systematic) + + pt$delta_q <- (log10(pt$dq_1)-log10(pt$dq_n))/(log10(pt$dp_n)-log10(pt$dp_1)) + pt$delta_q <- ifelse(pt[,prices[1]]==0,delta_q,pt$delta_q) + + remove.id.trend <- pt$id[pt$delta_q pt$first_cons) + + ### although it makes sense to divide by number of prices administered minus 1 (for initial value), + ### Stein et al. do not calculate bounce ratio this way + + pt$bounce_val <- pt$jumps/length(prices) + + remove.id.bounce <- pt$id[pt$bounce_val> bounce_val] + } + + if(bounce_type == "p2p"){ + + for (id_num in pt$id){ + num.bounces <- 0 + for (j in seq(1,length(prices)-1,1)){ + if (pt[pt$id == id_num,prices[j]] < pt[pt$id == id_num,prices[j+1]]){ + num.bounces <- num.bounces + 1 + } + } + if (num.bounces/(length(prices)-1) > bounce_val){ + pt <- pt[!pt[,"id"] %in% c(id_num),] + remove.id.bounce <- append(remove.id.bounce,id_num) } } - if (num.bounces/(length(prices)-1) > bounce_val){ - pt <- pt[!pt[,"id"] %in% c(id_num),] - remove.id.bounce <- append(remove.id.bounce,id_num) - } + } - # IDENTIFY & REMOVE IDs with reversals from zero - pt_long <- stats::reshape(as.data.frame(pt), idvar = "id", varying = prices, - v.names = "y", timevar = "x", sep = "", direction = "long") + ##### ----- REMOVE IDS WITH REVERSALS + + remove.id.reversal <- {} + + ### calculate number of zero-consumption responses present for each participant + pt$zero_count <- rowSums(pt[c(prices)]==0, na.rm = T) + + ## use instead of min function to surpress warning + minval <- function(x) {if (length(x)>0) min(x) else Inf} + + ### get breakpoint price + pt$bp <- apply(pt[ ,prices], 1, function(x) {names(x)[minval(which(x == 0))] }) - pt_long <- pt_long[order(pt_long$id),] - pt_long$x <- prices + ### get number of price items at and after bp + ### and determine if non-zero values exist between breakpoint and end price - check.unsys <- beezdemand::CheckUnsystematic(dat = pt_long, deltaq = -0.01, bounce = bounce_val, - reversals = rev_n, ncons0 = cons_n) + pt$bp_items <- sapply(pt$bp, function(x) sum(as.numeric(prices)>=as.numeric(x), na.rm = T)) + pt$postbp_val <- sapply(seq_along(pt[,1]), function(x) {pt[,prices][x, prices[(match(pt$bp,prices)+1)][x]]}) - remove.id.list <- check.unsys$ReversalsPass=="Fail" - remove.id.reversal <- check.unsys$id[check.unsys$ReversalsPass=="Fail"] - keep.id.list <- check.unsys$id[!remove.id.list] + pt$postbp_val[as.numeric(pt$bp)==max(as.numeric(prices)) | is.na(pt$bp)] <- "NO BP" + pt$postbp_val <- unlist(as.character(pt$postbp_val)) + pt$postbp_val[pt$postbp_val=="NO BP"] <- NA + pt$postbp_val <- as.numeric(pt$postbp_val) - pt_long2 <- pt_long[!is.na(match(pt_long$id,keep.id.list)),] + ## need to find the number of zeroes that exist after the first breakpoint (or minus the breakpoint) + ## this plus breakpoint will determine if there are additional reversals - pt_wide <- stats::reshape(as.data.frame(pt_long2), idvar = "id", v.names = "y", timevar = "x", direction = "wide") + pt$final0 <- apply(pt[,prices], 1, function(x) sum(cumsum(rev(x)) == 0)) - names(pt_wide) <- gsub("y.", "", names(pt_wide)) + ### THREE OPTIONS FOR REVERSALS FROM ZERO: + + ### i) No reversals allowed: + ### zero_count = bp_items + + if(rev_n==0){ + pt$reversals <- ifelse(pt$zero_count==pt$bp_items,FALSE,TRUE) + } + + ### ii) 1 reversal of a single 0 allowed: + ### zero_count = final0 + 1 + + if(rev_n==1 & cons_0==1){ + pt$reversals <- ifelse(pt$zero_count==pt$final0+1,FALSE,TRUE) + } + ### iii) 1 reversal of two consecutive 0s allowed: + ### zero_count = final0 + 1 + + if(rev_n==1 & cons_0==2){ + pt$reversals <- ifelse(pt$zero_count==pt$final0+2 & pt$postbp_val==0,FALSE,TRUE) + } + + remove.id.reversal <- pt$id[pt$reversals==TRUE] + + qc <- pt[c("id","delta_q","bounce_val","reversals")] + + remove.id <- c(remove.id.trend,remove.id.bounce,remove.id.reversal) + + pt <- pt[c("id",prices)] + pt <- pt[(!pt$id %in% remove.id),] + names(pt)[names(pt) == "id"] <- id_var + + pt_final <- list(data = as.data.frame(pt), qc_data = as.data.frame(qc)) if(length(remove.id.trend)==0) (remove.id.trend <- "NULL") if(length(remove.id.bounce)==0) (remove.id.bounce <- "NULL") @@ -90,6 +169,6 @@ pt_qc <- function(pt, id_var, bounce_val = 0.1, rev_n = 0, cons_n = 2){ "\n","IDs with a bounce violation: ", remove.id.bounce, "\n","IDs with a reversal violation: ", remove.id.reversal,"\n") - names(pt_wide)[names(pt_wide) == "id"] <- id_var - return(pt_wide) + return(pt_final) } + diff --git a/README.md b/README.md index 4a00fff..b1757f2 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,9 @@ [![R-CMD-check](https://github.com/PBCAR/PThelper/actions/workflows/r_check_standard.yml/badge.svg)](https://github.com/PBCAR/PThelper/actions/workflows/r_check_standard.yml) [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.5847710.svg)](https://doi.org/10.5281/zenodo.5847710) +## !! Updates for 2023 !! + +This package is currently undergoing a transformation in a few key ways prior to being submitted to CRAN. Firstly, the utility of the existing functions are being altered to provide additional information key in adequately documenting the processing for reproducibility. Secondly, the quality control function (`pt_qc()`) is being modified to accommodate different processing decisions, and to follow the Stein et al. (2015) 3 Criterion more strictly. Thirdly, additional functions will be added such as a summary function and a correlation function. + # The {PThelper} Package This package is designed to walk users through the various steps required to clean and process purchase task data. The benefit of this package is that it provides a standardized framework for processing purchase task data, improving reproducibility. @@ -43,7 +47,7 @@ Elasticity -- Measures sensitivity of consumption to increases in cost Purchase task processing requires preparation, and using the `price_prep()`, `pt_prep()`, and `pt_qc()` functions will ensure the proper cleaning of the data prior calculating the purchase task indicators. -The `plot_summary()` function can also be used at the end of the pre-processing stage to visualize the consumption and expenditure across the prices of the purchase task, prior to any outlier management. +The `plot_summary()` function can also be used at the end of the pre-processing stage to visualize the consumption and expenditure across the prices of the purchase task, prior to any outlier management. *ADDED:* Option to visualize individual consumption and expenditure using spaghetti plots using the `type` argument "individual". Another option to visualize by a grouping variable is available using the `type` argument "group". The `winsor_price()` function is an optional final pre-processing step which offers different outlier management techniques for the price-level data prior to curve-fitting and other purchase task processing. @@ -75,7 +79,17 @@ i) Trend violations -- Those who start with non-zero consumption and do not exhi ii) Excessive bounce ratios -- Bounce ratios identify inconsistencies in consumption values given. The default bounce ratio is 10%; and -iii) Excessive reversals in responses -- Those who exceed a user-defined number of reversals from a consumption of zero. The default defines a reversal as 2 or more consecutive zeroes and removes those with 1 or more reversals. +iii) Excessive reversals in responses -- Those who exceed a user-defined number of reversals from a consumption of zero. The default defines a reversal as 1 or more consecutive zeroes and removes those with 1 or more reversals. + +**Added in 2023 (v1.5.0):** + +a) The `delta_q` argument: This allows for the more strict trend violation as defined by Stein et al. (2015) to be applied. The previous trend violation removed anyone only with a delta Q value of 0. This new argument will remove anyone with a delta Q value below 0.025. The formula for delta Q is as follows: + +delta Q = (log10(quantity at price 1) - log10(quantity at price n)) / (log10(price n) - log10(price 1)) + +b) The ability to define bounce ratio calculation: Specifically, the previous method looked at price-to-price increases, with any increase in price Pn+1 compared to price Pn being flagged as a bounce ("jump") in consumption. This method has been retained, but the Stein et al. (2015) definition of a "jump" has been added. This method specifically defines a jump as any consumption values that are 25% higher (or other % as defined by the `jump` argument) than consumption at the lowest price. Both methods are equally valid for different reasons. + +c) Our own 'in-house' code to identify reversals from zeroes, as we previously utilized this functionality from the {beezdemand} package. #### f) Price-Level Winsorization (Optional): @@ -105,6 +119,8 @@ The fit of the elasticity curve by the `elasticity_curve()` function is done usi The overall sample curve is visualized, with the option to visualize each individual curve on the same plot (known as a spaghetti plot), identifying those with extreme sensitivity to price (i.e. high elasticity values; \> a z-score of 3). *NOTE:* The individual curves visualization can take time to render, especially with large data sets. +*ADDED:* Option to visualize the y-axis in either log-10 units or original units using the `y_type` argument. Options are "default" for log-10 units, or "original" for original consumption values. + ## iii) Final Processing of Purchase Task Indicators: Additional optional processing are available by using the `winsor_index()` and `plot_transform()` functions. These aid with outlier management and management of the distributional shape of the indicator-level purchase task variables. diff --git a/man/pt_qc.Rd b/man/pt_qc.Rd index 7fd5d18..8035c3d 100644 --- a/man/pt_qc.Rd +++ b/man/pt_qc.Rd @@ -4,28 +4,48 @@ \alias{pt_qc} \title{PT QC} \usage{ -pt_qc(pt, id_var, bounce_val = 0.1, rev_n = 0, cons_n = 2) +pt_qc( + pt, + id_var, + delta_q = 0.025, + bounce_type = "initial", + jump = 0.25, + bounce_val = 0.1, + rev_n = 0, + cons_0 = 1 +) } \arguments{ -\item{pt}{A data frame consisting of the `id_var` and purchase task variables.} +\item{pt}{A list consisting of two data frames: "data" which consists of the `id_var` and purchase task variables, and the "qc_data" which provides details on +the results of the quality control for all IDs.} \item{id_var}{The name of the unique identifier (ID) as identified in the data frame.} -\item{bounce_val}{Expressed as a proportion, the bounce value is used as a -threshold to identify excessive inconsistencies in responses. The default bounce -value is 0.1. IDs exceeding this bounce value are removed.} +\item{delta_q}{The log10-unit reduction in consumption from the first to last price. The default is set to 0.025 (suggested by Stein et al., 2015), +with values lower than 0.025 deemed non-systematic.} -\item{rev_n}{The number of acceptable reversals from zero. The default number is 0, -which removes IDs with 1 or more reversals from zero.} +\item{bounce_type}{The type of bounce criterion applied, one of c("initial","p2p"). The default is "initial", which follows the criterion put forth by +Stein et al. (2015). This method defines what constistutes a `jump`, with all subsequent prices in the purchase task being compared to the consumption +at the first price. The "p2p" option is a proposed alternative, which considers any increases in consumption from one price to the next to be a `jump` +in consumption. Both methods make use of the `bounce_val` which will only exclude participants above a set threshold of jumps.} -\item{cons_n}{The number of consecutive zeroes to signify a reversal from zero using +\item{jump}{Expressed as a proportion, the jump value is the percent increase in consumption at first price used to define an excessive increase in +consumption. The default is 0.25 (25% as suggested by Stein et al., 2015), meaning that any consumption 25% higher than consumption at first price +would be considered an excessive jump in consumption.} + +\item{bounce_val}{Expressed as a proportion, the bounce value is used as a threshold to identify excessive inconsistencies in responses. The +default bounce value is 0.1. IDs exceeding this bounce value are removed.} + +\item{rev_n}{The number of acceptable reversals from zero, one of c(0,1). The default number is 0, meaning no reversals from zero are allowed.} + +\item{cons_0}{The number of consecutive zeroes to signify a reversal from zero, one of c(1,2). The default is 1. Previously named `cons_n`. the {beezdemand} package. The default number of consecutive zeroes is 2.} } \description{ -This function helps users to conduct quality control on purchase task data by -using the 3-Criterion Algorithm by Stein et al. (2015) to remove non-systematic data: -Specifically, this function identifies and removes IDs with: i) trend violations; -ii) excessive bounce ratios; and iii) excessive reversals in responses. +This function helps users to conduct quality control on purchase task data to remove non-systematic data. Specifically, this function +identifies and removes IDs with: i) trend violations; ii) excessive bounce ratios; and iii) excessive reversals in responses. Quality +control options follow the proposed 3-criterion method by Stein et al. (2015), but also allow for customization, as well as use of +another option for bounce criterion used at PBCAR. } \details{ Stein, J. S., Koffarnus, M. N., Snider, S. E., Quisenberry, A. J., & Bickel, W. K. (2015).