-
Notifications
You must be signed in to change notification settings - Fork 0
/
create_name_dictionary.rb
53 lines (44 loc) · 1.61 KB
/
create_name_dictionary.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
require_relative 'document'
require_relative 'type'
require 'csv'
include Document
include Ota
# - list of ota terms types
# - insert array values into csv
# - csv --> name, ota type
# - determine ota type via text analysis --> normalize text, strip, run index or scan
# - keep track of docs that have no ota terms type
# puts TOSDR_DOCUMENT_NAMES
file_path = 'name_dictionary.csv'
File.delete(file_path) if File.exist?(file_path)
CSV.open("name_dictionary.csv", "w", write_headers: true,
headers: %w[tosdr_document_name ota_terms_type]) do |csv|
TOSDR_DOCUMENT_NAMES.each do |name|
# puts 'DOCUMENT: ' + name
term_match = false
ota_term_type = 'no match'
TERMS_TYPES.each do |key|
values = TERMS_TYPE_KEY_WORDS[key.to_sym] || []
# puts 'LOOP: ' + key.to_s + ' -> ' + values.length.to_s
values.each do |v|
# puts 'TRY: ' + key.to_s + ' -> ' + v.to_s + ' on ' + name
matches = name.scan(/#{v}/)
term_match = matches.length.positive?
# puts 'term_match: ' + term_match.to_s
ota_term_type = key if term_match
# puts 'MATCH: ' + v if term_match
break if term_match
end
break if term_match
end
csv << [name, ota_term_type]
end
end
dictionary = CSV.read('name_dictionary.csv', headers: true, liberal_parsing: true)
no_matches = dictionary.find_all { |row| row['ota_terms_type'] == 'no match' }
puts 'NO MATCHES COUNT: ' + no_matches.count.to_s
file_path = 'no_matches.csv'
File.delete(file_path) if File.exist?(file_path)
CSV.open("no_matches.csv", "w", write_headers: true) do |csv|
no_matches.each { |row| csv << row }
end