####
# create_valid_pos_sentence_orders.py
#
# Reads the reference grammar and the training/test transcriptions and
# prints every distinct valid POS-tag sentence pattern to stdout,
# one pattern per line.
#
# NOTE: this is a quick-and-dirty script. The required NLTK models are
# downloaded on first use (quietly, so the download messages do not
# pollute the printed patterns).
#####
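# Example invocation (assuming the three input files listed below sit in
# the working directory; redirect stdout to capture the patterns):
#
#   python create_valid_pos_sentence_orders.py > valid_pos_sentences.txt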
import csv
import re
import xml.etree.ElementTree as ET

import nltk
from nltk import word_tokenize
# Input files: gold-standard test data, training transcriptions, and the
# reference grammar listing the accepted responses for each prompt.
test_utterances_csv = "scst1_testData_textTask.csv"
utterances_csv = "sc2abc_sc1.csv"
reference_grammar = "referenceGrammar_v3.0.0.xml"

# Fetch the POS tagger and tokenizer models once; quiet=True keeps the
# download messages out of the output.
nltk.download('averaged_perceptron_tagger', quiet=True)
nltk.download('punkt', quiet=True)
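
# The reference grammar XML is assumed (from the parsing below) to have
# this shape:
#
#   <root>
#     <prompt_unit>
#       <prompt>...</prompt>
#       <response>...</response>
#       <response>...</response>
#     </prompt_unit>
#     ...
#   </root>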
def read_grammar(reference_grammar):
    """Parse the reference grammar and map each prompt to its responses."""
    tree = ET.parse(reference_grammar)
    root = tree.getroot()
    dictionary = {get_prompt(unit): get_responses(unit)
                  for unit in root.findall('prompt_unit')}
    return (dictionary, dictionary.keys())


def get_prompt(unit):
    return unit.find('prompt').text


def get_responses(unit):
    return [response.text for response in unit.findall('response')]

# ---------------------------------------------------
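# The spreadsheets are assumed to be tab-separated with a header row whose
# first cell is "Id". Because the two files lay out their columns
# differently, the caller passes the column indices for the transcription
# and the gold-standard correctness label.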
def read_and_process_spreadsheet(input_csv, transcription_col, correctness_col):
    """Collect the POS pattern of every correct transcription in the CSV."""
    tokenized_all = set()
    with open(input_csv, 'r', encoding="utf-8") as csv_infile:
        reader = csv.reader(csv_infile, delimiter='\t', quotechar='"')
        for row in reader:
            # Skip blank lines and the header row
            if row and not is_header_row(row):
                process_spreadsheet_row(row, tokenized_all,
                                        transcription_col, correctness_col)
    return tokenized_all


def is_header_row(row):
    return row[0] == 'Id'

def process_spreadsheet_row(row, tokenized_all, transcription_col, correctness_col):
    # Columns: "Id" "Prompt" "Wavfile" "RecResult" "Transcription" "language" "meaning" "Trace"
    transcription = row[transcription_col]
    language_correct_gold_standard = row[correctness_col]
    # Any non-empty value in the gold-standard column counts as "correct".
    if language_correct_gold_standard:
        tokenized_all.add(clean_and_tokenize(transcription))


def clean_and_tokenize(transcription):
    """Strip Kaldi noise tags, then map the utterance to its POS-tag sequence."""
    tuples = nltk.pos_tag(word_tokenize(clean_kaldi_tags(transcription)))
    if not tuples:
        return ""
    words, tags = zip(*tuples)
    return " ".join(tags)
def clean_kaldi_tags(rec_result):
    """Remove Kaldi/transcription noise markers from a recognition result."""
    words = rec_result.split(" ")
    sentence = []
    for word in words:
        # Drop whole words carrying the *v, *a, or *x noise markers
        word = re.sub(r"[A-Za-zöäüÖÄÜ]+[*][v]", "", word)
        word = re.sub(r"[A-Za-zöäüÖÄÜ]+[*][a]", "", word)
        word = re.sub(r"[A-Za-zöäüÖÄÜ]+[*][x]", "", word)
        word = re.sub(r"[*][z]", "", word)
        # Drop unintelligible-speech placeholders and filled pauses
        word = re.sub(r"-xxx-", "", word)
        word = re.sub(r"-xxx", "", word)
        word = re.sub(r"xxx", "", word)
        word = re.sub(r"ggg", "", word)
        word = re.sub(r"ah", "", word)
        if word:
            sentence.append(word)
    return " ".join(sentence)
# ---------------------------------------------------
def create_tokenized_grammar(grammar_dic, known_prompts):
    """Collect the POS pattern of every response in the reference grammar."""
    tokenized_all = set()
    for prompt in known_prompts:
        for response in grammar_dic[prompt]:
            words, tags = zip(*nltk.pos_tag(word_tokenize(response)))
            tokenized_all.add(" ".join(tags))
    return tokenized_all
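
# A grammar response such as "I have two brothers" would contribute the
# pattern "PRP VBP CD NNS" (tags per NLTK's default tagset).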
def do_all_processing():
    (grammar_dic, known_prompts) = read_grammar(reference_grammar)
    tokenized_all = create_tokenized_grammar(grammar_dic, known_prompts)
    # Column indices differ between the two spreadsheets: the training data
    # has the transcription and correctness label in columns 4 and 5, the
    # test data in columns 3 and 4.
    tokenized_all.update(read_and_process_spreadsheet(utterances_csv, 4, 5))
    tokenized_all.update(read_and_process_spreadsheet(test_utterances_csv, 3, 4))
    # tokenized_all is a set, so the print order varies between runs.
    for sentence in tokenized_all:
        print(sentence)


if __name__ == "__main__":
    do_all_processing()