-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathprep-sectionizers.sh
executable file
·105 lines (84 loc) · 3.91 KB
/
prep-sectionizers.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
#!/bin/zsh
###########################################################
## Environment configuration
##
## Nearly every value below is specific to a single machine.  Edit
## the checkout locations, conda environments, and data directories
## to match your own setup before running this script.

## Source checkouts
export SECTIONIZER_DIR=/Users/pmh/git/ots-clinical-sectionizer
export ENSEMBLE_DIR=/Users/pmh/git/ots-ensemble-systems
export ETUDE_DIR=/Users/pmh/git/etude
export CONFIG_DIR=/Users/pmh/git/etude-engine-configs

## Conda environments; each stage invokes <env>/bin/python3 directly.
## The leading ~ is left unquoted on purpose so the shell expands it.
export SECTIONIZER_CONDA=~/opt/anaconda3/envs/sections-py3.8
export ENSEMBLE_CONDA=~/opt/anaconda3/envs/ensemble-py3.8
export ETUDE_CONDA=~/opt/anaconda3/envs/etude-py3.7

## Input corpora and model locations
export TEXTRACTOR_DIR='/Users/pmh/Box Sync/TBIC/Sectionizing/Ref2'
export BIGODM_DIR='/Users/pmh/data/sections/2014_i2b2_challenge_with_dai_normalized'
export MEDSPACY_DIR='/Users/pmh/data/sections/data/medspacy_out'

## Task and ensemble method names (used to build the paths below)
export TASK=sectionizing
export METHOD=voting

## Output locations
export RESULT_DIR=/Users/pmh/git/ots-ensemble-systems/data/out3
## NOTE(review): RESULT_FILE lives under data/out2 while RESULT_DIR is
## data/out3 -- confirm this is intentional.  Within this script it is
## only referenced by the commented-out summary code at the very end.
export RESULT_FILE=/Users/pmh/git/ots-ensemble-systems/data/out2/${TASK}/${TASK}_results.csv
export CLASSIFIER_OUT="${RESULT_DIR}/classifiers"
export MERGED_OUT="${RESULT_DIR}/merged"
export SYS_DIR="${RESULT_DIR}/${METHOD}"

## Create every output directory the later stages write into.
mkdir -p \
    "${CLASSIFIER_OUT}" \
    "${MERGED_OUT}" \
    "${SYS_DIR}" \
    "${RESULT_DIR}/etude"
###########################################################
## Run individual classifiers
##
## Both classifiers read the same input directory
## (${TEXTRACTOR_DIR}/brat5) and each writes to its own subdirectory
## of ${CLASSIFIER_OUT}.  The types directory is derived from
## ${ENSEMBLE_DIR} instead of a hard-coded absolute path, so that
## customizing ENSEMBLE_DIR at the top of the script is sufficient.

## Rules-based medspaCy sectionizer
"${SECTIONIZER_CONDA}/bin/python3" \
    "${SECTIONIZER_DIR}/medspacy/medspaCy_sectionizer.py" \
    --types-dir "${ENSEMBLE_DIR}/types" \
    --input-dir "${TEXTRACTOR_DIR}/brat5" \
    --output-dir "${CLASSIFIER_OUT}/rules_brat5"

## SVM-based sectionizer using spaCy and scikit-learn.
## The model directory is ${MEDSPACY_DIR} with "_svm_brat5_rbf"
## appended to its name, i.e. a sibling path rather than a
## subdirectory of ${MEDSPACY_DIR}.
"${SECTIONIZER_CONDA}/bin/python3" \
    "${SECTIONIZER_DIR}/svm-spacy/spaCY-SVM_sectionizer.py" \
    --types-dir "${ENSEMBLE_DIR}/types" \
    --input-dir "${TEXTRACTOR_DIR}/brat5" \
    --output-dir "${CLASSIFIER_OUT}/svm_brat5_rbf" \
    --model-dir "${MEDSPACY_DIR}_svm_brat5_rbf" \
    --svm-kernel "rbf"
###########################################################
## Merge the oracle/reference annotation with the above classifiers to
## generate a single input corpus for the meta-classifier ensemble
## system to read in.
##
## Reads the reference annotations from ${TEXTRACTOR_DIR}/brat5 and the
## classifier output from ${CLASSIFIER_OUT}; writes the merged corpus
## to ${MERGED_OUT}.  The types directory is derived from
## ${ENSEMBLE_DIR} so it follows any customization of that variable.
"${ENSEMBLE_CONDA}/bin/python3" \
    "${ENSEMBLE_DIR}/medspaCy/sectionizers-converter.py" \
    --types-dir "${ENSEMBLE_DIR}/types" \
    --input-ref-dir "${TEXTRACTOR_DIR}/brat5" \
    --input-sharpn-systems "${CLASSIFIER_OUT}" \
    --output-dir "${MERGED_OUT}"
###########################################################
## Run voting meta-classifier
##
## CLASSIFIERS and MINVOTES are exported because they are also used
## further down to name the ETUDE score log.
## NOTE(review): CLASSIFIERS is handed to --classifier-list as the
## single value "12"; presumably this selects classifiers 1 and 2 --
## confirm against voting-ensemble.py.
export CLASSIFIERS=12
export MINVOTES=1

## Simple voting ensemble system.
## Reads the merged corpus from ${MERGED_OUT} and writes its output to
## ${SYS_DIR}.  The types directory is derived from ${ENSEMBLE_DIR} so
## it follows any customization of that variable.
## NOTE(review): /tmp/sample.txt is hard-coded and nothing in this
## script creates it; it must already exist before this stage runs.
"${ENSEMBLE_CONDA}/bin/python3" \
    "${ENSEMBLE_DIR}/medspaCy/voting-ensemble.py" \
    --types-dir "${ENSEMBLE_DIR}/types" \
    --file-list /tmp/sample.txt \
    --input-dir "${MERGED_OUT}" \
    --voting-unit sentence \
    --classifier-list "${CLASSIFIERS}" \
    --min-vote "${MINVOTES}" \
    --zero-strategy drop \
    --output-dir "${SYS_DIR}"
###########################################################
## Score voting ensemble output
##
## Runs ETUDE with the reference annotations in
## ${TEXTRACTOR_DIR}/brat5 against the ensemble output in ${SYS_DIR}.
## ETUDE's standard output is captured in a log file under
## ${RESULT_DIR}/etude whose name is built from the method, the
## minimum vote count, and the classifier list.
export MATCH_FLAG=partial

"${ETUDE_CONDA}/bin/python3" "${ETUDE_DIR}/etude.py" \
    --reference-conf "${CONFIG_DIR}/brat/sections_textractor_brat.conf" \
    --reference-input "${TEXTRACTOR_DIR}/brat5" \
    --test-conf "${CONFIG_DIR}/uima/sections_note-nlp_xmi.conf" \
    --test-input "${SYS_DIR}" \
    --score-key Parent \
    --score-value Header \
    --file-suffix ".ann" ".xmi" \
    --fuzzy-match-flags "${MATCH_FLAG}" \
    --metrics Accuracy TP FP FN TN Recall Precision F1 \
    > "${RESULT_DIR}/etude/${METHOD}_${MINVOTES}_${CLASSIFIERS}.log"

## TODO - update grepping of stats for easy summary csv.
## The disabled lines below are kept as the starting point for that
## work; they would append one summary row per run to ${RESULT_FILE}.
#export COVERAGE=`grep micro ${RESULT_DIR}/etude/${METHOD}_${MINVOTES}_${CLASSIFIERS}.log | cut -f 2 | head -n 1 | tr '\n' '\t'`
#export ACCURACY=`grep micro ${RESULT_DIR}/etude/${METHOD}_${MINVOTES}_${CLASSIFIERS}.log | cut -f 2 | tail -n 1 | tr '\n' '\t'`
#echo "${METHOD}	${CLASSIFIERS}	${ACCURACY}${COVERAGE}${MINVOTES}" \
# >> ${RESULT_FILE}