-
Notifications
You must be signed in to change notification settings - Fork 55
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
GitHub issue #75. Temporary fix to detect Canadian Social Insurance Numbers.
- Loading branch information
Showing
7 changed files
with
133 additions
and
15 deletions.
There are no files selected for viewing
Binary file not shown.
2 changes: 1 addition & 1 deletion
2
sample_projects/database_discovery/DataDiscovery.sh
100644 → 100755
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1 @@ | ||
java -jar DataDefender.jar database-discovery -d | ||
java -jar DataDefender.jar database-discovery -d |
19 changes: 10 additions & 9 deletions
19
sample_projects/database_discovery/datadiscovery.properties
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,11 +1,12 @@ | ||
probability_threshold=0.55 | ||
english_tokens=/Users/armenak/work/strider/DataDefender/sample_projects/mysql/en-token.bin | ||
name=/Users/armenak/work/strider/DataDefender/sample_projects/mysql/en-ner-person.bin | ||
location=/Users/armenak/work/strider/DataDefender/sample_projects/mysql/en-ner-location.bin | ||
date=/Users/armenak/work/strider/DataDefender/sample_projects/mysql/en-ner-date.bin | ||
money=/Users/armenak/work/strider/DataDefender/sample_projects/mysql/en-ner-money.bin | ||
organization=/Users/armenak/work/strider/DataDefender/sample_projects/mysql/en-ner-organization.bin | ||
time=/Users/armenak/work/strider/DataDefender/sample_projects/mysql/en-ner-time.bin | ||
names=/Users/armenak/work/strider/DataDefender/sample_projects/mysql/names.dict | ||
english_tokens=en-token.bin | ||
name=en-ner-person.bin | ||
location=en-ner-location.bin | ||
date=en-ner-date.bin | ||
money=en-ner-money.bin | ||
organization=en-ner-organization.bin | ||
time=en-ner-time.bin | ||
names=names.dict | ||
limit=1000 | ||
models=name,location,date,time,money | ||
#models=name,location,date,time,money | ||
models=name |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
43 changes: 43 additions & 0 deletions
43
src/main/java/com/strider/datadefender/specialcase/SinDetector.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,43 @@ | ||
/** | ||
* Copyright 2014-2017, Armenak Grigoryan, and individual contributors as indicated | ||
* by the @authors tag. See the copyright.txt in the distribution for a | ||
* full listing of individual contributors. | ||
* | ||
* This is free software; you can redistribute it and/or modify it | ||
* under the terms of the GNU Lesser General Public License as | ||
* published by the Free Software Foundation; either version 2.1 of | ||
* the License, or (at your option) any later version. | ||
* | ||
* This software is distributed in the hope that it will be useful, | ||
* but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
* Lesser General Public License for more details. | ||
* | ||
*/ | ||
package com.strider.datadefender.specialcase; | ||
|
||
import com.strider.datadefender.database.metadata.MatchMetaData; | ||
import com.strider.datadefender.extensions.BiographicFunctions; | ||
import org.apache.log4j.Logger; | ||
import static org.apache.log4j.Logger.getLogger; | ||
|
||
/** | ||
* | ||
* @author strider | ||
*/ | ||
public class SinDetector { | ||
private static final Logger log = getLogger(SinDetector.class); | ||
|
||
public static MatchMetaData detectSin(MatchMetaData data, String text) { | ||
if (data.getColumnType().equals("INT") || data.getColumnType().equals("VARCHAR")) { | ||
BiographicFunctions bf = new BiographicFunctions(); | ||
if ( ( text.matches("[0-9]+") && text.length() == 9) && bf.isValidSIN(text)) { | ||
log.info("Valid SIN " + text); | ||
data.setModel("sin"); | ||
data.setAverageProbability(1); | ||
return data; | ||
} | ||
} | ||
return null; | ||
} | ||
} |
57 changes: 57 additions & 0 deletions
57
src/main/java/com/strider/datadefender/traindatagen/SINGenerator.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,57 @@ | ||
/* | ||
* | ||
* Copyright 2014-2017, Armenak Grigoryan, and individual contributors as indicated | ||
* by the @authors tag. See the copyright.txt in the distribution for a | ||
* full listing of individual contributors. | ||
* | ||
* This is free software; you can redistribute it and/or modify it | ||
* under the terms of the GNU Lesser General Public License as | ||
* published by the Free Software Foundation; either version 2.1 of | ||
* the License, or (at your option) any later version. | ||
* | ||
* This software is distributed in the hope that it will be useful, | ||
* but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
* Lesser General Public License for more details. | ||
* | ||
*/ | ||
package com.strider.datadefender.traindatagen; | ||
|
||
import com.strider.datadefender.extensions.BiographicFunctions; | ||
import com.strider.datadefender.functions.CoreFunctions; | ||
import java.io.PrintWriter; | ||
|
||
/** | ||
* Generate valid list of social insurance numbers and random text in a format | ||
* acceptable for OpenNLP data model trainer. | ||
* | ||
* @author Armenak Grigoryan | ||
*/ | ||
public class SINGenerator { | ||
|
||
private static final String START_TAG = "<START:sin>"; | ||
private static final String END_TAG = "<END>"; | ||
private static final String SPACE = " "; | ||
private static final int LINES = 20000; | ||
private static final String RANDOM_FILE = "/Users/strider/work/strider/DataDefender/src/main/resources/lipsum.txt"; | ||
private static final String OUTOUT_FILE = "/Users/strider/work/strider/DataDefender/src/main/resources/sin.txt"; | ||
|
||
|
||
//<START:medicine> Augmentin-Duo <END> is a penicillin antibiotic that contains two medicines - <START:medicine> amoxicillin trihydrate <END> | ||
// <START:medicine> potassium clavulanate <END>. They work together to kill certain types of bacteria and are used to treat certain types of bacterial infections | ||
public static void main(String[] args) throws Exception { | ||
|
||
BiographicFunctions bf = new BiographicFunctions(); | ||
CoreFunctions cf = new CoreFunctions(); | ||
PrintWriter writer = new PrintWriter(OUTOUT_FILE, "UTF-8"); | ||
|
||
for (int i=0; i<=LINES; i++) { | ||
StringBuilder sb = new StringBuilder(); | ||
sb.append(START_TAG).append(SPACE).append(bf.randomStringSIN()).append(SPACE).append(END_TAG).append(SPACE).append(cf.randomStringFromFile(RANDOM_FILE)).append(SPACE); | ||
sb.append(START_TAG).append(SPACE).append(bf.randomStringSIN()).append(SPACE).append(END_TAG).append(SPACE).append(cf.randomStringFromFile(RANDOM_FILE)); | ||
writer.println(sb.toString()); | ||
} | ||
writer.flush(); | ||
} | ||
|
||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters