-
Notifications
You must be signed in to change notification settings - Fork 1
/
vectorization.py
122 lines (98 loc) · 4.22 KB
/
vectorization.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
### This file contains the functions necessary to vectorize the ICD labels and text inputs
# Author: Guillaume De Roo
import numpy as np
import pandas as pd
import re
import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
# Vectorize ICD codes
def vectorize_icd_string(x, code_list):
"""Takes a string with ICD codes and returns an array of the right of 0/1"""
r = []
for code in code_list:
if code in x: r.append(1)
else: r.append(0)
return np.asarray(r)
def vectorize_icd_column(df, col_name, code_list):
"""Takes a column and applies the """
r = df[col_name].apply(lambda x: vectorize_icd_string(x, code_list))
r = np.transpose(np.column_stack(r))
return r
# Clean Text
def clean_str(string):
"""Cleaning of notes"""
""" Cleaning from Guillaume """
string = string.lower()
string = string.replace("\n", " ") # remove the lines
string = re.sub("\[\*\*.*?\*\*\]", "", string) # remove the things inside the [** **]
string = re.sub("[^a-zA-Z0-9\ \']+", " ", string)
""" Tokenization/string cleaning for all datasets except for SST.
Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
"""
#string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
string = re.sub(r"\'s", " \'s", string)
string = re.sub(r"\'ve", " \'ve", string)
string = re.sub(r"n\'t", " n\'t", string)
string = re.sub(r"\'re", " \'re", string)
string = re.sub(r"\'d", " \'d", string)
string = re.sub(r"\'ll", " \'ll", string)
#string = re.sub(r",", " , ", string)
#string = re.sub(r"!", " ! ", string)
#string = re.sub(r"\(", " \( ", string)
#string = re.sub(r"\)", " \) ", string)
#string = re.sub(r"\?", " \? ", string)
string = re.sub(r"\s{2,}", " ", string)
""" Canonize numbers"""
string = re.sub(r"(\d+)", "DG", string)
return string.strip()
def clean_notes(df, col_name):
r = df[col_name].apply(lambda x: clean_str(x))
return r
# Vectorize and Pad notes Text
def vectorize_notes(col, MAX_NB_WORDS, verbose = True):
"""Takes a note column and encodes it into a series of integer
Also returns the dictionnary mapping the word to the integer"""
tokenizer = Tokenizer(num_words = MAX_NB_WORDS)
tokenizer.fit_on_texts(col)
data = tokenizer.texts_to_sequences(col)
note_length = [len(x) for x in data]
vocab = tokenizer.word_index
MAX_VOCAB = len(vocab)
if verbose:
print('Vocabulary size: %s' % MAX_VOCAB)
print('Average note length: %s' % np.mean(note_length))
print('Max note length: %s' % np.max(note_length))
return data, vocab, MAX_VOCAB
def pad_notes(data, MAX_SEQ_LENGTH):
data = pad_sequences(data, maxlen = MAX_SEQ_LENGTH)
return data, data.shape[1]
# Creates an embedding Matrix
# Based on https://blog.keras.io/using-pre-trained-word-embeddings-in-a-keras-model.html
def embedding_matrix(f_name, dictionary, EMBEDDING_DIM, verbose = True, sigma = None):
"""Takes a pre-trained embedding and adapts it to the dictionary at hand
Words not found will be all-zeros in the matrix"""
# Dictionary of words from the pre trained embedding
pretrained_dict = {}
with open(f_name, 'r') as f:
for line in f:
values = line.split()
word = values[0]
coefs = np.asarray(values[1:], dtype='float32')
pretrained_dict[word] = coefs
# Default values for absent words
if sigma:
pretrained_matrix = sigma * np.random.rand(len(dictionary) + 1, EMBEDDING_DIM)
else:
pretrained_matrix = np.zeros((len(dictionary) + 1, EMBEDDING_DIM))
# Substitution of default values by pretrained values when applicable
for word, i in dictionary.items():
vector = pretrained_dict.get(word)
if vector is not None:
pretrained_matrix[i] = vector
if verbose:
print('Vocabulary in notes:', len(dictionary))
print('Vocabulary in original embedding:', len(pretrained_dict))
inter = list( set(dictionary.keys()) & set(pretrained_dict.keys()) )
print('Vocabulary intersection:', len(inter))
return pretrained_matrix, pretrained_dict