Skip to content

Commit

Permalink
GitHub issue #75. Temporary fix to detect Canadian Social Insurance N…
Browse files Browse the repository at this point in the history
…umbers.
  • Loading branch information
armenak committed Dec 27, 2016
1 parent 1678d9f commit 0b76d55
Show file tree
Hide file tree
Showing 7 changed files with 133 additions and 15 deletions.
Binary file not shown.
2 changes: 1 addition & 1 deletion sample_projects/database_discovery/DataDiscovery.sh
100644 → 100755
Original file line number Diff line number Diff line change
@@ -1 +1 @@
java -jar DataDefender.jar database-discovery -d
java -jar DataDefender.jar database-discovery -d
19 changes: 10 additions & 9 deletions sample_projects/database_discovery/datadiscovery.properties
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
probability_threshold=0.55
english_tokens=/Users/armenak/work/strider/DataDefender/sample_projects/mysql/en-token.bin
name=/Users/armenak/work/strider/DataDefender/sample_projects/mysql/en-ner-person.bin
location=/Users/armenak/work/strider/DataDefender/sample_projects/mysql/en-ner-location.bin
date=/Users/armenak/work/strider/DataDefender/sample_projects/mysql/en-ner-date.bin
money=/Users/armenak/work/strider/DataDefender/sample_projects/mysql/en-ner-money.bin
organization=/Users/armenak/work/strider/DataDefender/sample_projects/mysql/en-ner-organization.bin
time=/Users/armenak/work/strider/DataDefender/sample_projects/mysql/en-ner-time.bin
names=/Users/armenak/work/strider/DataDefender/sample_projects/mysql/names.dict
english_tokens=en-token.bin
name=en-ner-person.bin
location=en-ner-location.bin
date=en-ner-date.bin
money=en-ner-money.bin
organization=en-ner-organization.bin
time=en-ner-time.bin
names=names.dict
limit=1000
models=name,location,date,time,money
#models=name,location,date,time,money
models=name
24 changes: 20 additions & 4 deletions src/main/java/com/strider/datadefender/DatabaseDiscoverer.java
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,8 @@
import com.strider.datadefender.database.metadata.IMetaData;
import com.strider.datadefender.database.metadata.MatchMetaData;
import com.strider.datadefender.database.sqlbuilder.ISQLBuilder;
import com.strider.datadefender.extensions.BiographicFunctions;
import com.strider.datadefender.specialcase.SinDetector;
import com.strider.datadefender.utils.CommonUtils;
import java.text.DateFormat;
import java.text.ParseException;
Expand Down Expand Up @@ -101,7 +103,9 @@ private List<MatchMetaData> discoverAgainstSingleModel(final IDBFactory factory,
final List<MatchMetaData> map = metaData.getMetaData();
// Start running NLP algorithms for each column and collect percentage
matches = new ArrayList<>();

MatchMetaData specialCaseData = null;
boolean specialCase = true;

final ISQLBuilder sqlBuilder = factory.createSQLBuilder();
List<Double> probabilityList;
for(final MatchMetaData data: map) {
Expand Down Expand Up @@ -147,17 +151,21 @@ private List<MatchMetaData> discoverAgainstSingleModel(final IDBFactory factory,
}

final String sentence = resultSet.getString(1);

if (specialCase) {
specialCaseData = SinDetector.detectSin(data, sentence);
}

if (sentence != null && !sentence.isEmpty()) {

String processingValue;
String processingValue = "";
if (data.getColumnType().equals("DATE") ||
data.getColumnType().equals("TIMESTAMP") ||
data.getColumnType().equals("DATETIME")
) {
final DateFormat originalFormat = new SimpleDateFormat(sentence, Locale.ENGLISH);
final DateFormat targetFormat = new SimpleDateFormat("MMM d, yy");
final java.util.Date date = originalFormat.parse(sentence);
processingValue = targetFormat.format(date);
processingValue = targetFormat.format(date);
} else {
processingValue = sentence;
}
Expand Down Expand Up @@ -197,6 +205,14 @@ private List<MatchMetaData> discoverAgainstSingleModel(final IDBFactory factory,
data.setModel(model.getName());
matches.add(data);
}

// Special processing
log.info("specialCaseData is null " + (specialCaseData == null));
log.info("specialCase is true" + specialCase);
if (specialCase && specialCaseData != null) {
log.info(specialCaseData.getModel());
matches.add(specialCaseData);
}
}

return matches;
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
/**
* Copyright 2014-2017, Armenak Grigoryan, and individual contributors as indicated
* by the @authors tag. See the copyright.txt in the distribution for a
* full listing of individual contributors.
*
* This is free software; you can redistribute it and/or modify it
* under the terms of the GNU Lesser General Public License as
* published by the Free Software Foundation; either version 2.1 of
* the License, or (at your option) any later version.
*
* This software is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
*/
package com.strider.datadefender.specialcase;

import com.strider.datadefender.database.metadata.MatchMetaData;
import com.strider.datadefender.extensions.BiographicFunctions;
import org.apache.log4j.Logger;
import static org.apache.log4j.Logger.getLogger;

/**
 * Special-case detector for Canadian Social Insurance Numbers (SIN).
 *
 * <p>Inspects a single column value and, when it looks like a valid SIN,
 * marks the supplied column metadata with the {@code "sin"} model.
 *
 * @author strider
 */
public class SinDetector {
    private static final Logger log = getLogger(SinDetector.class);

    /** A SIN candidate must consist of exactly this many digits. */
    private static final int SIN_LENGTH = 9;

    /**
     * Checks whether {@code text} is a valid SIN stored in an {@code INT} or
     * {@code VARCHAR} column.
     *
     * <p>On a match the passed-in {@code data} object is mutated (model set to
     * {@code "sin"}, average probability set to 1) and the same instance is
     * returned.
     *
     * @param data metadata of the column the value was read from; must not be null
     * @param text the column value to test; may be null (e.g. SQL NULL), in
     *             which case no match is reported
     * @return {@code data} when the value is a valid SIN, otherwise {@code null}
     */
    public static MatchMetaData detectSin(MatchMetaData data, String text) {
        // Column values come straight from ResultSet.getString and can be null;
        // treat that as "no match" instead of throwing a NullPointerException.
        if (text == null) {
            return null;
        }

        // Constant-first equals also tolerates a null column type.
        final String columnType = data.getColumnType();
        if (!"INT".equals(columnType) && !"VARCHAR".equals(columnType)) {
            return null;
        }

        // Cheap structural check first so that the validator is only created
        // for plausible candidates.
        if (!isSinShaped(text)) {
            return null;
        }

        // isValidSIN is defined elsewhere; presumably a checksum validation - verify.
        final BiographicFunctions bf = new BiographicFunctions();
        if (!bf.isValidSIN(text)) {
            return null;
        }

        // NOTE(review): this writes the raw SIN to the log at info level.
        // Consider masking it - left unchanged to preserve existing output.
        log.info("Valid SIN " + text);
        data.setModel("sin");
        data.setAverageProbability(1);
        return data;
    }

    /**
     * Returns true when {@code text} is exactly nine ASCII digits.
     * Equivalent to {@code text.matches("[0-9]+") && text.length() == 9}
     * without recompiling a regular expression for every row.
     */
    private static boolean isSinShaped(final String text) {
        if (text.length() != SIN_LENGTH) {
            return false;
        }
        for (int i = 0; i < SIN_LENGTH; i++) {
            final char c = text.charAt(i);
            if (c < '0' || c > '9') {
                return false;
            }
        }
        return true;
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
/*
*
* Copyright 2014-2017, Armenak Grigoryan, and individual contributors as indicated
* by the @authors tag. See the copyright.txt in the distribution for a
* full listing of individual contributors.
*
* This is free software; you can redistribute it and/or modify it
* under the terms of the GNU Lesser General Public License as
* published by the Free Software Foundation; either version 2.1 of
* the License, or (at your option) any later version.
*
* This software is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
*/
package com.strider.datadefender.traindatagen;

import com.strider.datadefender.extensions.BiographicFunctions;
import com.strider.datadefender.functions.CoreFunctions;
import java.io.PrintWriter;

/**
 * Generate valid list of social insurance numbers and random text in a format
 * acceptable for OpenNLP data model trainer.
 *
 * <p>Each output line has the shape
 * {@code <START:sin> SIN <END> random-text <START:sin> SIN <END> random-text}.
 *
 * @author Armenak Grigoryan
 */
public class SINGenerator {

    private static final String START_TAG = "<START:sin>";
    private static final String END_TAG = "<END>";
    private static final String SPACE = " ";
    /** Number of training lines written to the output file. */
    private static final int LINES = 20000;
    /** Default source of filler text, used when no first argument is given. */
    private static final String RANDOM_FILE = "/Users/strider/work/strider/DataDefender/src/main/resources/lipsum.txt";
    /** Default destination file, used when no second argument is given. */
    private static final String OUTPUT_FILE = "/Users/strider/work/strider/DataDefender/src/main/resources/sin.txt";

    /**
     * Writes {@link #LINES} lines of tagged training data.
     *
     * @param args optional: {@code args[0]} overrides the filler-text input
     *             file, {@code args[1]} overrides the output file; the
     *             built-in default paths are used for anything not supplied
     * @throws Exception if the output file cannot be created or written, or
     *                   if one of the generator helpers fails
     */
    public static void main(String[] args) throws Exception {
        final String randomFile = (args != null && args.length > 0) ? args[0] : RANDOM_FILE;
        final String outputFile = (args != null && args.length > 1) ? args[1] : OUTPUT_FILE;

        final BiographicFunctions bf = new BiographicFunctions();
        final CoreFunctions cf = new CoreFunctions();

        // try-with-resources closes (and thereby flushes) the writer even when
        // one of the generator calls throws.
        try (PrintWriter writer = new PrintWriter(outputFile, "UTF-8")) {
            // '<' rather than '<=' so that exactly LINES lines are produced.
            for (int i = 0; i < LINES; i++) {
                final StringBuilder sb = new StringBuilder();
                appendTaggedSin(sb, bf);
                sb.append(SPACE).append(cf.randomStringFromFile(randomFile)).append(SPACE);
                appendTaggedSin(sb, bf);
                sb.append(SPACE).append(cf.randomStringFromFile(randomFile));
                writer.println(sb.toString());
            }
            // PrintWriter never throws on write failures; surface them explicitly.
            if (writer.checkError()) {
                throw new java.io.IOException("Failed writing training data to " + outputFile);
            }
        }
    }

    /** Appends one {@code <START:sin> SIN <END>} fragment with a freshly generated SIN. */
    private static void appendTaggedSin(final StringBuilder sb, final BiographicFunctions bf) {
        sb.append(START_TAG).append(SPACE).append(bf.randomStringSIN()).append(SPACE).append(END_TAG);
    }

}
3 changes: 2 additions & 1 deletion src/main/resources/datadiscovery.properties
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,8 @@ date=/Users/armenak/work/strider/DataDefender/sample_projects/mysql/en-ner-date.
money=/Users/armenak/work/strider/DataDefender/sample_projects/mysql/en-ner-money.bin
organization=/Users/armenak/work/strider/DataDefender/sample_projects/mysql/en-ner-organization.bin
time=/Users/armenak/work/strider/DataDefender/sample_projects/mysql/en-ner-time.bin
sin=/Users/armenak/work/strider/DataDefender/sample_projects/mysql/en-ner-sin.bin
names=/Users/armenak/work/strider/DataDefender/sample_projects/mysql/names.dict
limit=10000
#models=person,location,date,time,money
models=date
models=sin

0 comments on commit 0b76d55

Please sign in to comment.