-
Notifications
You must be signed in to change notification settings - Fork 24
/
tokenize_tiers_in_TextGrid.praat
189 lines (170 loc) · 6.95 KB
/
tokenize_tiers_in_TextGrid.praat
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
# Tokenize utterance tiers in a Praat TextGrid file
# (Slightly adapted for ELFA)
#
# This script makes copies of all the tiers in each TextGrid in a given directory and tokenizes the annotations in each
# original tier to the corresponding new word tier.
#
# 9.4.2012
# Mietta Lennes
#
# Ask the user for the input directory.
# The form field "Directory" becomes the string variable directory$.
form Tokenize the utterance tiers in TextGrid files
    text Directory
endform
# directory$ is used below as a plain prefix for the file pattern and file names,
# so make sure a non-empty directory ends in a path separator.
# An empty directory is left untouched (the pattern is then just "*.TextGrid").
if directory$ <> "" and right$ (directory$, 1) <> "/" and right$ (directory$, 1) <> "\"
    directory$ = directory$ + "/"
endif
# Read the list of TextGrid files in the given directory into a Strings object called "grids"
Create Strings as file list... grids 'directory$'*.TextGrid
files = Get number of strings
# echo clears the Info window and prints the number of files found
echo 'files' TextGrid files found in directory 'directory$'.'newline$'
# Loop through all the files.
# Expects: files (number of entries in the Strings object "grids") and directory$ (path prefix).
# For every TextGrid: each tier gets a copy named "<tier name>-word" placed directly after it,
# and every non-empty interval label of the original tier is split at spaces into words whose
# durations in the copy are proportional to their character counts.
for file to files
    # The Strings object is the selected object here (after creation, and again at the end
    # of each iteration).
    file$ = Get string... file
    Read from file... 'directory$''file$'
    printline Tokenizing: 'file$'
    gridname$ = selected$ ("TextGrid", 1)
    # The result is saved under the object name in the input directory.
    # NOTE(review): presumably this is the file that was just read, i.e. the input is overwritten - confirm.
    gridfile$ = directory$ + gridname$ + ".TextGrid"
    # Absolute end time of the TextGrid. Word boundaries are absolute times, so they have to be
    # compared with the end time, not with the total duration (the two differ whenever the
    # TextGrid does not start at 0).
    grid_end = Get end time

    # Copy the original transcript tiers into new word tiers, which will be named originalname-word.
    # After this loop, speaker k has its original tier at position 2k-1 and its word tier at 2k.
    select TextGrid 'gridname$'
    numberOfSpeakers = Get number of tiers
    for speaker to numberOfSpeakers
        original_tier = 2 * speaker - 1
        word_tier = original_tier + 1
        tier$ = Get tier name... original_tier
        new_tier$ = tier$ + "-word"
        Duplicate tier... original_tier word_tier 'new_tier$'
    endfor

    # Process each pair of original tier + word tier.
    # NOTE(review): interval queries are used on every tier, so this assumes that the TextGrid
    # contains interval tiers only - confirm for your data.
    for speaker to numberOfSpeakers
        original_tier = 2 * speaker - 1
        word_tier = original_tier + 1
        numberOfUtteranceIntervals = Get number of intervals... original_tier
        # Loop through all utterance intervals for this speaker and tokenize them
        for utt to numberOfUtteranceIntervals
            utterance$ = Get label of interval... original_tier utt
            if utterance$ <> ""
                # Remove any trailing spaces:
                while right$ (utterance$, 1) = " "
                    utterance$ = left$ (utterance$, length (utterance$) - 1)
                endwhile
                # Remove any leading spaces:
                while left$ (utterance$, 1) = " "
                    utterance$ = right$ (utterance$, length (utterance$) - 1)
                endwhile

                # Divide the utterance into words at (one or more) space characters.
                # The words are stored in word_1$, word_2$, ... and counted in words.
                words = 0
                tmp_utterance$ = utterance$
                while length (tmp_utterance$) > 0
                    words = words + 1
                    space_position = index (tmp_utterance$, " ")
                    if space_position > 0
                        word_'words'$ = left$ (tmp_utterance$, space_position - 1)
                        tmp_utterance$ = right$ (tmp_utterance$, length (tmp_utterance$) - space_position)
                        # Skip the rest of a run of spaces:
                        while left$ (tmp_utterance$, 1) = " "
                            tmp_utterance$ = right$ (tmp_utterance$, length (tmp_utterance$) - 1)
                        endwhile
                    else
                        # No space left: the remainder is the last word.
                        word_'words'$ = tmp_utterance$
                        tmp_utterance$ = ""
                    endif
                endwhile

                # Count the total number of characters ("duration units") in each word and in the whole utterance,
                # excluding the special overlap markers {} in ELFA.
                # last_word remembers the last word that has at least one countable character.
                utterance_length = 0
                last_word = 0
                for word to words
                    word_'word'_length = 0
                    for char to length (word_'word'$)
                        character$ = mid$ (word_'word'$, char, 1)
                        if character$ <> "{" and character$ <> "}"
                            utterance_length = utterance_length + 1
                            word_'word'_length = word_'word'_length + 1
                        endif
                    endfor
                    if word_'word'_length > 0
                        last_word = word
                    endif
                endfor

                utterance_start = Get starting point... original_tier utt
                utterance_end = Get end point... original_tier utt

                # ELFA modification:
                # Check for errors where the next utterance starts before the current utterance ends in the same utterance tier
                # (typical for ELFA where the original tiers have been created automatically from a non-time-aligned text file):
                if utt < numberOfUtteranceIntervals
                    next_utt = utt + 1
                    next_utterance_start = Get starting point... original_tier next_utt
                    next_utterance$ = Get label of interval... original_tier next_utt
                    # Remove the utterance end that occurs erroneously after the next utterance start:
                    if next_utterance_start < utterance_end
                        Remove right boundary... original_tier utt
                        Insert boundary... original_tier next_utterance_start
                        Set interval text... original_tier utt 'utterance$'
                        Set interval text... original_tier next_utt 'next_utterance$'
                        # The interval count and the limits of this utterance are re-read after the repair:
                        numberOfUtteranceIntervals = Get number of intervals... original_tier
                        utterance_start = Get starting point... original_tier utt
                        utterance_end = Get end point... original_tier utt
                        # Do the same fix for the word tier where the errors have been copied:
                        word_interval = Get interval at time... word_tier utterance_start
                        Remove right boundary... word_tier word_interval
                        Insert boundary... word_tier next_utterance_start
                        Set interval text... word_tier word_interval 'utterance$'
                        next_word_interval = word_interval + 1
                        Set interval text... word_tier next_word_interval 'next_utterance$'
                        # This should fix it! - ML 9.4.2012
                    endif
                endif

                # Divide the duration of the utterance by the number of characters, which was counted before.
                # If the utterance has no countable characters, no word passes the length test below,
                # so dur_unit is never used in that case.
                utterance_dur = utterance_end - utterance_start
                dur_unit = utterance_dur / utterance_length
                word_interval = Get interval at time... word_tier utterance_start
                word_start = Get starting point... word_tier word_interval
                # Insert the starting boundary for the first word, if required:
                if word_start < utterance_start
                    Insert boundary... word_tier utterance_start
                    word_interval = Get interval at time... word_tier utterance_start
                    word_start = Get starting point... word_tier word_interval
                endif

                # Insert the end boundary for each word in the utterance, according to the number of characters:
                for word to words
                    # In case the word is not empty, insert the text and the end boundary:
                    if word_'word'_length > 0
                        word$ = word_'word'$
                        Set interval text... word_tier word_interval 'word$'
                        word_end = word_start + (word_'word'_length * dur_unit)
                        # The last non-empty word ends at the end of the utterance, where the word tier
                        # already has a boundary (copied from the original tier). It is skipped explicitly,
                        # because the accumulated word_end may fall marginally short of utterance_end
                        # through rounding, which would otherwise create a sliver interval.
                        if word < last_word and word_end < grid_end and word_end < utterance_end
                            Insert boundary... word_tier word_end
                        endif
                        word_start = word_end
                        word_interval = Get interval at time... word_tier word_start
                    endif
                endfor
            endif
        endfor
        # Move on to the next speaker (= the next utterance tier or original transcript tier)
    endfor

    # Save this TextGrid file and remove the object from cluttering the Object window
    select TextGrid 'gridname$'
    Write to text file... 'gridfile$'
    Remove
    # and continue with the next TextGrid file in the directory.
    select Strings grids
endfor
# Finally, remove the Strings object.
select Strings grids
Remove
# Done!
printline ... Done.