Skip to content

Commit

Permalink
Merge pull request #19 from UBC-MDS/14_dummy_data_test
Browse files Browse the repository at this point in the history
14 dummy data test
  • Loading branch information
tzoght authored Jan 28, 2023
2 parents 2d46abc + b0bf307 commit 845f19d
Show file tree
Hide file tree
Showing 8 changed files with 162 additions and 34 deletions.
14 changes: 2 additions & 12 deletions R/redact_creditcardnumber.R
Original file line number Diff line number Diff line change
Expand Up @@ -13,24 +13,14 @@
#' redact_creditcardnumber(x)
redact_creditcardnumber <- function(string, hash_spotted = FALSE, replace_with = "CREDITCARD") {
  # Replace every credit card number found in `string`.
  #
  # Arguments:
  #   string       character vector to scan.
  #   hash_spotted if TRUE, each match is replaced by its MD5 digest;
  #                otherwise it is replaced by `replace_with`.
  #   replace_with replacement text used when `hash_spotted` is FALSE.
  #
  # Returns `string` with every match replaced; text outside the matches
  # (including spaces and punctuation) is left as it was.
  #
  # The input is matched as given: separators inside a number are not
  # stripped first, so only contiguous digit runs are recognised.
  #
  # The arguments are intentionally never printed: `string` may hold the
  # very card numbers this function exists to hide.

  # regex pattern matches: Visa - Mastercard - Amex - Discover - Diners Club - JCB
  regex <- "(?:4[0-9]{12}(?:[0-9]{3})?|5[1-5][0-9]{14}|(222[1-9]|22[3-9][0-9]|2[3-6][0-9]{2}|27[01][0-9]|2720)[0-9]{12}|3[47][0-9]{13}|3(?:0[0-5]|[68][0-9])[0-9]{13}|6(?:011|5[0-9]{2})[0-9]{12}|(?:2131|1800|35\\d{3})\\d{11})"

  if (hash_spotted) {
    # Hash everything the callback receives (not just the first element) so
    # that it returns one digest per match, whether stringr hands over the
    # matches one at a time or all together.
    hash_match <- function(x) openssl::md5(x)
    return(stringr::str_replace_all(string, regex, hash_match))
  }

  stringr::str_replace_all(string, regex, replace_with)
}


8 changes: 5 additions & 3 deletions R/redact_email.R
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,15 @@
#' redact_email(x)
#'
redact_email <- function(string, hash_spotted = FALSE, replace_with = "EMAILADDRS") {
  # Replace every e-mail address found in `string`.
  #
  # Arguments:
  #   string       character vector to scan.
  #   hash_spotted if TRUE, each match is replaced by its MD5 digest;
  #                otherwise it is replaced by `replace_with`.
  #   replace_with replacement text used when `hash_spotted` is FALSE.
  #
  # Returns `string` with every match replaced; the surrounding text is
  # left as it was.
  #
  # The pattern accepts "@" or the word " at " as the separator and "." or
  # " dot " between domain labels. Upper and lower case letters are both
  # covered by the explicit a-zA-Z classes; the words "at" and "dot" are
  # matched in lower case only.
  regex <- "\\b[a-zA-Z0-9!#$%&'*+/=?^_`{|}~-](?:[\\.a-zA-Z0-9!#$%&'*+/=?^_`{|}~-]{0,62}[a-zA-Z0-9!#$%&'*+/=?^_`{|}~-])?(?:@|\\sat\\s)[a-zA-Z0-9](?:(?=[a-zA-Z0-9-]*(\\.|\\sdot\\s))(?:\\.|\\sdot\\s|[a-zA-Z0-9-]){0,251}[a-zA-Z0-9])+\\b"

  if (hash_spotted) {
    # Hash everything the callback receives (not just the first element) so
    # that it returns one digest per match, whether stringr hands over the
    # matches one at a time or all together.
    hash_match <- function(x) openssl::md5(x)
    return(stringr::str_replace_all(string, regex, hash_match))
  }

  # The pattern uses a lookahead, so it goes through stringr for both
  # branches rather than base gsub() with its default engine.
  stringr::str_replace_all(string, regex, replace_with)
}

21 changes: 21 additions & 0 deletions tests/testthat/data_with_pii.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
first_name,last_name,email_address,visa_cc,master_cc,balance,active_member,age
Jacob,King,the following is my email address JacobKing100@yahoo.com,this is my credit card: 4658481398602920,5339168719695860,100,1,24
Chloe,Lavoie,the following is my email address ChloeLavoie200@gmail.com,this is my credit card: 4532546510575280,5284482559079650,200,0,36
Myles,Clark,MylesClark300@hotmail.com,this is my credit card: 4539650939655290,5338287181016540,300,1,23
Daniel,Murray,DanielMurray400@outlook.ca,this is my credit card: 4716505160113470,5581255820397210,400,0,28
​Lucy,Landry,​LucyLandry500@ubc.ca,this is my credit card: 4716908400371550,5453813871212040,500,1,37
Austin,Cote,AustinCote600@yahoo.com,this is my credit card: 4716061284742460,5253980136053700,600,1,31
Leo,Leblanc,LeoLeblanc700@gmail.com,this is my credit card: 4959442428525640,this is my master card number: 5531588204138300,700,0,41
Luke,Cote,LukeCote800@hotmail.com,4716080523510600,5222993268333860,800,1,43
Chloe,Martin,ChloeMartin900@outlook.ca,4929012768643790,5298262095510460,900,0,58
Sophia,Taylor,SophiaTaylor1000@ubc.ca,4024007156452210,5301187452198840,1000,1,67
Sebastian,Li,SebastianLi1100@yahoo.com,4556769552517390,5236759291390490,1100,0,25
Theodore,Walker,TheodoreWalker1200@gmail.com,4929331453008610,5250776199472790,1200,1,29
Grayson,Moore,GraysonMoore1300@hotmail.com,4024007104267000,5453889807764760,1300,0,38
Madelyn,Ross,MadelynRoss1400@outlook.ca,4556869333621620,5495761075800590,1400,1,64
Charlie,Johnson,CharlieJohnson1500@ubc.ca,4539797776530150,5538805763318700,1500,0,66
Isaac,Davis,IsaacDavis1600@yahoo.com,4556132845135520,5126127814285380,1600,1,55
Grace,Thomas,GraceThomas1700@gmail.com,4485265068804380,5218882790545470,1700,0,43
Kayden,Thomas,KaydenThomas1800@hotmail.com,4532625426014480,5581447491954320,1800,1,48
Peyton,Bergeron,PeytonBergeron1900@outlook.ca,4532097151827700,5415089843570270,1900,0,58
Evelyn,Johnston,EvelynJohnston2000@ubc.ca,4485317522398220,5208126744511550,2000,0,29
21 changes: 21 additions & 0 deletions tests/testthat/data_with_pii_cleaned_hash.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
first_name,last_name,email_address,visa_cc,master_cc,balance,active_member,age
Jacob,King,the following is my email address d3ebf4160b78814ae2f942d621559ef8,this is my credit card: 0646e60e317ab6c3ca5ea7db720bb6d6,c077ea8331f1357b655546c0c6dd030c,100,1,24
Chloe,Lavoie,the following is my email address ccc33e19aed8144140e0e3b472d71d0c,this is my credit card: c23b451c4b9ab9bc3667fa482dab6594,1beee7a164fd2c76e8b4e588131d64c9,200,0,36
Myles,Clark,8a7ab41909ffabb0e5bd3e759e926550,this is my credit card: fcdf2584d0b985dd96c6acefdd80db8b,ff05ef02f05938f288220125c64d72e3,300,1,23
Daniel,Murray,e786525bbe5cdbd0e1b0723580c0be35,this is my credit card: 9122eab992e216247c9a44fb75968ee7,8c33e7d6d9f5b8b3e9f68b11416e9fe3,400,0,28
​Lucy,Landry,​66aa0f296907a96a0a15a64c54a0271c,this is my credit card: 9ff1b7835812f926eaf01509bf1388a7,2fe45c134eebaf4476d23fea7642824a,500,1,37
Austin,Cote,1aac2a4671c819e7ba448156eb2a945d,this is my credit card: 001d7261b6cfb156ec8e433a5634cde3,9e3172495cc711edec297ef8a5f095a5,600,1,31
Leo,Leblanc,b575f97818252f3ae9de320f38e6a26b,this is my credit card: 72bc1e81c9f91219fa42a34832329dc5,this is my master card number: f69a82acf9a08ad499e06f0750626e92,700,0,41
Luke,Cote,726ff593c99ae2734b61f7ba08cc0339,c68e0b958a085134e22585ac92c08e3c,6bb66b6d89abbbf0718f66720547ce57,800,1,43
Chloe,Martin,40b148d282dfa8d609ab18207188463a,22dabc63d739da192ef30a2bbcb06e61,a8f9b7c5b5e4c4b3f8a4d37d51078cac,900,0,58
Sophia,Taylor,a6c9c2015492ed280d2da2b94f3f37a4,444225bea558baa2a4c006f688853d1f,6ad1e9a84f5dbd02512391a07c001ed5,1000,1,67
Sebastian,Li,026dec7828d784304966748f8aac14e4,10fc94534572f34b028c61523f4605fe,d45be3c831abd474467bdd0b5d9e4dd6,1100,0,25
Theodore,Walker,0c89577b11f335687d51f6769baf809b,b28d6de54411d3884d8b24d55e285004,52ccea324a0c1a50a266a33283645042,1200,1,29
Grayson,Moore,4e7ca558fbc639c4cb24f8588c118b3d,ba76eb7fc1be6293e5feba89f5c7639f,39c34c75de06effa59da842781653c4a,1300,0,38
Madelyn,Ross,94cdc647d86ef5a8b54d5ff54b4c35b4,211acd46154d7a437dcc03b3ce46e5ce,bdefeaadfd160b9ff82eb33bd2726b1d,1400,1,64
Charlie,Johnson,3c235fdb2aa343a247c0f51ceda5eabe,ba97f2ad35d4d9f6fe00ed50e2762327,9fe8835d0d95deccf7dfd99c02c9c294,1500,0,66
Isaac,Davis,ffd1f67fff8581d26db24b85ce1d479a,53309f3853ff954ef7ed621b38501e28,e2737b551ebe7a0f4842f3f11bc2aa87,1600,1,55
Grace,Thomas,8d54befa39a3f13bea178f38a8fc67de,99a0625ae373ff242d7ed9c76930b836,8aba9728ea64663867b50a17c10bf729,1700,0,43
Kayden,Thomas,e71770b14ccf5aa8587750c5c5318f4a,779c725caf15e67c16b59536eaa5b862,92610a6913a995c2d9f5e08bfcd6c105,1800,1,48
Peyton,Bergeron,b7299528a41c8f5baf74ecc541b7aa4e,060783327b0e977a61614fa2129a7328,68321411ad37a3dccabe7902620ef7d0,1900,0,58
Evelyn,Johnston,95473fc56071e41d16b3b769a07d17ad,a22af5a670e749c4e8529a840088c372,83d71d2e14d5de862d8bcd28c23c5417,2000,0,29
21 changes: 21 additions & 0 deletions tests/testthat/data_with_pii_cleaned_non_hash.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
first_name,last_name,email_address,visa_cc,master_cc,balance,active_member,age
Jacob,King,the following is my email address EMAILADDRS,this is my credit card: CREDITCARD,CREDITCARD,100,1,24
Chloe,Lavoie,the following is my email address EMAILADDRS,this is my credit card: CREDITCARD,CREDITCARD,200,0,36
Myles,Clark,EMAILADDRS,this is my credit card: CREDITCARD,CREDITCARD,300,1,23
Daniel,Murray,EMAILADDRS,this is my credit card: CREDITCARD,CREDITCARD,400,0,28
​Lucy,Landry,​EMAILADDRS,this is my credit card: CREDITCARD,CREDITCARD,500,1,37
Austin,Cote,EMAILADDRS,this is my credit card: CREDITCARD,CREDITCARD,600,1,31
Leo,Leblanc,EMAILADDRS,this is my credit card: CREDITCARD,this is my master card number: CREDITCARD,700,0,41
Luke,Cote,EMAILADDRS,CREDITCARD,CREDITCARD,800,1,43
Chloe,Martin,EMAILADDRS,CREDITCARD,CREDITCARD,900,0,58
Sophia,Taylor,EMAILADDRS,CREDITCARD,CREDITCARD,1000,1,67
Sebastian,Li,EMAILADDRS,CREDITCARD,CREDITCARD,1100,0,25
Theodore,Walker,EMAILADDRS,CREDITCARD,CREDITCARD,1200,1,29
Grayson,Moore,EMAILADDRS,CREDITCARD,CREDITCARD,1300,0,38
Madelyn,Ross,EMAILADDRS,CREDITCARD,CREDITCARD,1400,1,64
Charlie,Johnson,EMAILADDRS,CREDITCARD,CREDITCARD,1500,0,66
Isaac,Davis,EMAILADDRS,CREDITCARD,CREDITCARD,1600,1,55
Grace,Thomas,EMAILADDRS,CREDITCARD,CREDITCARD,1700,0,43
Kayden,Thomas,EMAILADDRS,CREDITCARD,CREDITCARD,1800,1,48
Peyton,Bergeron,EMAILADDRS,CREDITCARD,CREDITCARD,1900,0,58
Evelyn,Johnston,EMAILADDRS,CREDITCARD,CREDITCARD,2000,0,29
21 changes: 21 additions & 0 deletions tests/testthat/data_without_pii.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
first_name,last_name,balance,active_member,age
Jacob,King,100,1,24
Chloe,Lavoie,200,0,36
Myles,Clark,300,1,23
Daniel,Murray,400,0,28
​Lucy,Landry,500,1,37
Austin,Cote,600,1,31
Leo,Leblanc,700,0,41
Luke,Cote,800,1,43
Chloe,Martin,900,0,58
Sophia,Taylor,1000,1,67
Sebastian,Li,1100,0,25
Theodore,Walker,1200,1,29
Grayson,Moore,1300,0,38
Madelyn,Ross,1400,1,64
Charlie,Johnson,1500,0,66
Isaac,Davis,1600,1,55
Grace,Thomas,1700,0,43
Kayden,Thomas,1800,1,48
Peyton,Bergeron,1900,0,58
Evelyn,Johnston,2000,0,29
52 changes: 52 additions & 0 deletions tests/testthat/test-clean_data_frame_dummy_data.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
# Fixture tables, read from the CSV files that sit next to this test file.
df_with_pii <- data.frame(read.csv(file = "data_with_pii.csv"))
df_without_pii <- read.csv(file = "data_without_pii.csv")
df_with_pii_cleaned_hash <- read.csv(file = "data_with_pii_cleaned_hash.csv")
df_with_pii_cleaned_non_hash <- read.csv(file = "data_with_pii_cleaned_non_hash.csv")


# Spotters (built the same way as in test-clean_data_frame.R).
# Each spotter is a three-element list; presumably (redaction function,
# hash_spotted, replace_with) -- confirm against clean_data_frame().

# Redacting spotters: matches become a fixed token.
spotter_1_r <- list(redact_email, FALSE, "EMAILADDRS")
spotter_2_r <- list(redact_creditcardnumber, FALSE, "CREDITCARD")
spotters_redacted <- list(spotter_2_r, spotter_1_r)

# Hashing spotters: the hash flag is TRUE and the third element is 0.
spotter_1_h <- list(redact_email, TRUE, 0)
spotter_2_h <- list(redact_creditcardnumber, TRUE, 0)
spotters_hashed <- list(spotter_2_h, spotter_1_h)


# test suite 1: check that the column names are correct
test_that("Column name checking", {
  # Columns expected in every fixture that still carries the PII fields.
  pii_columns <- c("first_name", "last_name", "email_address", "visa_cc", "master_cc")
  # Columns checked in the fixture that has no PII fields.
  name_columns <- c("first_name", "last_name")

  for (column in pii_columns) {
    expect_true(column %in% names(df_with_pii))
  }

  for (column in name_columns) {
    expect_true(column %in% names(df_without_pii))
  }

  for (column in pii_columns) {
    expect_true(column %in% names(df_with_pii_cleaned_hash))
  }

  for (column in pii_columns) {
    expect_true(column %in% names(df_with_pii_cleaned_non_hash))
  }
})

# test suite 2: clean both
test_that("clean_data_frame: email & credit card hashed and redacted", {
  # Hashing spotters must reproduce the pre-computed hashed fixture.
  hashed_result <- clean_data_frame(df_with_pii, spotters_hashed)
  expect_equal(hashed_result, df_with_pii_cleaned_hash)

  # Redacting spotters must reproduce the fixed-token fixture.
  redacted_result <- clean_data_frame(df_with_pii, spotters_redacted)
  expect_equal(redacted_result, df_with_pii_cleaned_non_hash)
})

# test suite 3: clean nothing
test_that("clean_data_frame: dataframe without PII should be the same as before", {
  # With nothing to spot, both spotter sets must leave the table untouched.
  after_hashing <- clean_data_frame(df_without_pii, spotters_hashed)
  expect_equal(after_hashing, df_without_pii)

  after_redacting <- clean_data_frame(df_without_pii, spotters_redacted)
  expect_equal(after_redacting, df_without_pii)
})
38 changes: 19 additions & 19 deletions tests/testthat/test-redact_creditcardnumber.R
Original file line number Diff line number Diff line change
Expand Up @@ -8,34 +8,34 @@ fake_cc <- c("VISA, 4556129404313766",
)

# Expected results keep the input's own spacing (", " between fields),
# because only the card number itself is replaced.

# cc replace hash
fake_cc_hashes <- c(
  "VISA, e93723ee0d38e30a68902aef6b0033de",
  "MASTERCARD, 26f6a9e75c5e837215449790c3edb9f5",
  "AMEX, 302e590b9f7052140eedde5f8b9e6d91",
  "DINERS, 177b6b9ce5f4e37dbfefd957841485e6",
  "DISCOVER, 60a88fe71767c452194c2168033506c1",
  "JCB, b5906f02d7fd3684279f4fabc0ac9ec1"
)

# cc replace fixed string
fake_cc_fixed <- c(
  "VISA, CREDITCARD", "MASTERCARD, CREDITCARD", "AMEX, CREDITCARD",
  "DINERS, CREDITCARD", "DISCOVER, CREDITCARD", "JCB, CREDITCARD"
)


# cc embedded in a longer record
test_cc <- c(
  "VISA, 4916363769587210, 10/2023, 992",
  "VISA, 4916363769587210, 10/2023, 992"
)

# cc cleaned - fixed string
test_cc_clean <- c(
  "VISA, CREDITCARD, 10/2023, 992",
  "VISA, CREDITCARD, 10/2023, 992"
)

# cc cleaned - hashed
test_cc_clean_hash <- c(
  "VISA, d45a171d816f68107a1af7c3ee4950f6, 10/2023, 992",
  "VISA, d45a171d816f68107a1af7c3ee4950f6, 10/2023, 992"
)

# digit strings of equal and random length to credit cards; the expected
# output is identical to the input
number_string <- c("5628404238239405, 5673289472024660, 8709, 356785, 1111111111111111, 329481234558901")

number_string_clean <- c("5628404238239405, 5673289472024660, 8709, 356785, 1111111111111111, 329481234558901")


# test case 1: replace credit card number with fixed string
Expand Down

0 comments on commit 845f19d

Please sign in to comment.