-
Notifications
You must be signed in to change notification settings - Fork 1k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge branch 'master' into inf_build
- Loading branch information
Showing
123 changed files
with
1,289 additions
and
29,322 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,64 @@ | ||
# A Recipe for the AMI corpus. | ||
|
||
"The AMI Meeting Corpus consists of 100 hours of meeting recordings. The recordings use a range of signals synchronized to a common timeline. These include close-talking and far-field microphones, individual and room-view video cameras, and output from a slide projector and an electronic whiteboard. During the meetings, the participants also have unsynchronized pens available to them that record what is written. The meetings were recorded in English using three different rooms with different acoustic properties, and include mostly non-native speakers." See http://groups.inf.ed.ac.uk/ami/corpus/overview.shtml for more details. | ||
|
||
We use the individual headset microphone (IHM) setting for preparing train, dev and test sets. The recipe here is heavily inspired from the preprocessing scripts in Kaldi - https://github.com/kaldi-asr/kaldi/tree/master/egs/ami . | ||
|
||
## Steps to download and prepare the audio and text data | ||
|
||
Prepare train, dev and test sets as list files to be used for training with wav2letter. Replace `[...]` with appropriate paths | ||
|
||
``` | ||
python prepare.py -dst [...] | ||
``` | ||
|
||
The above script downloads the AMI data and segments it into shorter `.flac` audio files based on word timestamps. Limited supervision training sets for 10min, 1hr and 10hr will be generated as well. | ||
|
||
The following structure will be generated | ||
``` | ||
>tree -L 4 | ||
. | ||
├── audio | ||
│ ├── EN2001a | ||
│ │ ├── EN2001a.Headset-0.wav | ||
│ │ ├── ... | ||
│ │ └── EN2001a.Headset-4.wav | ||
│ ├── EN2001b | ||
│ ├── ... | ||
│ ├── ... | ||
│ ├── IS1009d | ||
│ │ ├── ... | ||
│ │ └── IS1009d.Headset-3.wav | ||
│ └── segments | ||
│ ├── ES2005a | ||
│ │ ├── ES2005a_H00_MEE018_0.75_1.61.flac | ||
│ │ ├── ES2005a_H00_MEE018_13.19_16.05.flac | ||
│ │ ├── ... | ||
│ │ └── ... | ||
│ ├── ... | ||
│ └── IS1009d | ||
│ ├── ... | ||
│ └── ... | ||
├── lists | ||
│ ├── dev.lst | ||
│ ├── test.lst | ||
│ ├── train_10min_0.lst | ||
│ ├── train_10min_1.lst | ||
│ ├── train_10min_2.lst | ||
│ ├── train_10min_3.lst | ||
│ ├── train_10min_4.lst | ||
│ ├── train_10min_5.lst | ||
│ ├── train_9hr.lst | ||
│ └── train.lst | ||
│ | ||
└── text | ||
├── ami_public_manual_1.6.1.zip | ||
└── annotations | ||
├── 00README_MANUAL.txt | ||
├── ... | ||
├── transcripts0 | ||
├── transcripts1 | ||
├── transcripts2 | ||
├── words | ||
└── youUsages | ||
``` |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,218 @@ | ||
#!/usr/bin/env perl | ||
|
||
# Copyright 2014 University of Edinburgh (Author: Pawel Swietojanski) | ||
|
||
# The script - based on punctuation times - splits segments longer than #words (input parameter) | ||
# and produces a slightly more normalised form of transcripts, as follows | ||
# MeetID Channel Spkr stime etime transcripts | ||
|
||
#use List::MoreUtils 'indexes'; | ||
use strict; | ||
use warnings; | ||
|
||
sub split_transcripts; | ||
sub normalise_transcripts; | ||
|
||
# Merge two hash references into a single hash and return it.
# On a key collision the value from the first hash wins; the clash is
# reported with warn and the second hash's value is discarded.
sub merge_hashes {
  my ($first_ref, $second_ref) = @_;
  my %merged = %$first_ref;
  for my $key (keys %$second_ref) {
    if (exists $merged{$key}) {
      warn "Key [$key] is in both hashes!";
      next;
    }
    $merged{$key} = $second_ref->{$key};
  }
  return %merged;
}
|
||
# Debug helper: dump a hash reference to stdout, one "key : value"
# line per entry, in sorted key order.
sub print_hash {
  my ($href) = @_;
  print "$_ : $href->{$_}\n" for sort keys %$href;
}
|
||
# Build a zero-padded "bbbbbbb_eeeeeee" segment name from begin/end
# times in seconds; each side is the time in centiseconds, 7 digits.
# Note: %07d truncates towards zero, matching the original behaviour.
#
# Fix: the original wrote `sprintf(...) || die 'Input undefined!'`,
# but sprintf here always yields a non-empty (true) string, so the
# die was dead code and undef inputs silently became "0000000".
# Check the inputs explicitly instead.
sub get_name {
  my ($btime, $etime) = @_;
  (defined $btime && defined $etime) || die 'Input undefined!';
  return sprintf("%07d_%07d", $btime*100, $etime*100);
}
|
||
# Recursively split a too-long utterance at the labelled comma whose
# timestamp lies closest to the utterance's temporal midpoint.
#   $text              - transcript text containing comma labels like ",3"
#   $comma_times       - hashref: comma label => timestamp
#   $btime, $etime     - begin/end time of this utterance
#   $max_words_per_seg - maximum words allowed per output segment
# Returns a hash mapping get_name(start,end) => transcript fragment.
sub split_on_comma {

  my ($text, $comma_times, $btime, $etime, $max_words_per_seg)= @_;
  my %comma_hash = %$comma_times;

  print "Btime, Etime : $btime, $etime\n";

  my $stime = ($etime+$btime)/2; #split time
  my $skey = "";
  my $otime = $btime;
  # NOTE(review): keys are ordered by string comparison (cmp) of their
  # timestamps, not numeric <=> -- presumably harmless for this scan,
  # but confirm if timestamps can differ in digit count.
  foreach my $k (sort {$comma_hash{$a} cmp $comma_hash{$b} } keys %comma_hash) {
    print "Key : $k : $comma_hash{$k}\n";
    my $ktime = $comma_hash{$k};
    if ($ktime==$btime) { next; }   # comma at the very start: not usable
    if ($ktime==$etime) { last; }   # reached the utterance end: stop
    # keep the comma whose time is nearest the midpoint $stime
    if (abs($stime-$ktime)/2<abs($stime-$otime)/2) {
      $otime = $ktime;
      $skey = $k;
    }
  }

  my %transcripts = ();

  # no usable labelled comma found -> emit the whole text as one segment
  if (!($skey =~ /[\,][0-9]+/)) {
    print "Cannot split into less than $max_words_per_seg words! Leaving : $text\n";
    $transcripts{get_name($btime, $etime)}=$text;
    return %transcripts;
  }

  print "Splitting $text on $skey at time $otime (stime is $stime)\n";
  my @utts1 = split(/$skey\s+/, $text);
  for (my $i=0; $i<=$#utts1; $i++) {
    # first piece spans [btime, comma time]; later pieces [comma time, etime]
    my $st = $btime;
    my $et = $comma_hash{$skey};
    if ($i>0) {
      $st=$comma_hash{$skey};
      $et = $etime;
    }
    my (@utts) = split (' ', $utts1[$i]);
    if ($#utts < $max_words_per_seg) {
      my $nm = get_name($st, $et);
      print "SplittedOnComma[$i]: $nm : $utts1[$i]\n";
      $transcripts{$nm} = $utts1[$i];
    } else {
      # piece is still too long -- recurse with the narrowed time window
      print 'Continue splitting!';
      my %transcripts2 = split_on_comma($utts1[$i], \%comma_hash, $st, $et, $max_words_per_seg);
      %transcripts = merge_hashes(\%transcripts, \%transcripts2);
    }
  }
  return %transcripts;
}
|
||
# Split one word-level transcript (words with interleaved punctuation
# marks and timestamp tokens) into segments of at most
# $max_words_per_seg words: first on full stops / question marks,
# then recursively on labelled commas via split_on_comma().
#   $text              - arrayref of tokens (words, punctuation, times)
#   $btime, $etime     - utterance boundary times
#   $max_words_per_seg - word-count threshold per segment
# Returns: hash segment-name => transcript text, or the empty list
# when punctuation marks and timestamps cannot be paired up.
sub split_transcripts {
  @_ == 4 || die 'split_transcripts: transcript btime etime max_word_per_seg';

  my ($text, $btime, $etime, $max_words_per_seg) = @_;
  my (@transcript) = @$text;

  # classify tokens: standalone punctuation vs. timestamps like "12.34"
  my (@punct_indices) = grep { $transcript[$_] =~ /^[\.,\?\!\:]$/ } 0..$#transcript;
  my (@time_indices) = grep { $transcript[$_] =~ /^[0-9]+\.[0-9]*/ } 0..$#transcript;
  # delete on an array slice returns the removed values and leaves
  # undef holes in @transcript (deprecated in modern Perl; kept as-is)
  my (@puncts_times) = delete @transcript[@time_indices];
  my (@puncts) = @transcript[@punct_indices];

  # pairing below assumes one timestamp per punctuation mark, in order;
  # bail out when the counts disagree
  if ($#puncts_times != $#puncts) {
    print 'Ooops, different number of punctuation signs and timestamps! Skipping.';
    return ();
  }

  #first split on full stops
  my (@full_stop_indices) = grep { $puncts[$_] =~ /[\.\?]/ } 0..$#puncts;
  my (@full_stop_times) = @puncts_times[@full_stop_indices];

  # bracket with the utterance boundaries so chunk i spans
  # [full_stop_times[i], full_stop_times[i+1]]
  unshift (@full_stop_times, $btime);
  push (@full_stop_times, $etime);

  # relabel commas in place as ",0", ",1", ... and remember their times
  # so split_on_comma() can locate them inside the joined text later
  my %comma_puncts = ();
  for (my $i=0, my $j=0;$i<=$#punct_indices; $i++) {
    my $lbl = "$transcript[$punct_indices[$i]]$j";
    if ($lbl =~ /[\.\?].+/) { next; }   # full stops stay unlabelled
    $transcript[$punct_indices[$i]] = $lbl;
    $comma_puncts{$lbl} = $puncts_times[$i];
    $j++;
  }

  #print_hash(\%comma_puncts);

  print "InpTrans : @transcript\n";
  print "Full stops: @full_stop_times\n";

  # uppercase everything and cut into sentence-sized chunks
  my @utts1 = split (/[\.\?]/, uc join(' ', @transcript));
  my %transcripts = ();
  for (my $i=0; $i<=$#utts1; $i++) {
    my (@utts) = split (' ', $utts1[$i]);
    if ($#utts < $max_words_per_seg) {
      print "ReadyTrans: $utts1[$i]\n";
      $transcripts{get_name($full_stop_times[$i], $full_stop_times[$i+1])} = $utts1[$i];
    } else {
      # sentence still too long -> split further on labelled commas
      print "TransToSplit: $utts1[$i]\n";
      my %transcripts2 = split_on_comma($utts1[$i], \%comma_puncts, $full_stop_times[$i], $full_stop_times[$i+1], $max_words_per_seg);
      print "Hash TR2:\n"; print_hash(\%transcripts2);
      print "Hash TR:\n"; print_hash(\%transcripts);
      %transcripts = merge_hashes(\%transcripts, \%transcripts2);
      print "Hash TR_NEW : \n"; print_hash(\%transcripts);
    }
  }
  return %transcripts;
}
|
||
# Apply rough preliminary normalisation to a single transcript string
# and return the cleaned copy.  Substitution order matters and mirrors
# the Kaldi AMI recipe: strip labelled punctuation, break fused words,
# expand spelled acronyms, squeeze whitespace, then restore the few
# dashed interjections that will be added to the dictionary.
sub normalise_transcripts {
  my $line = $_[0];

  # drop leftover punctuation labels, e.g. ",0" or ".3"
  $line =~ s/[\.\,\?\!\:][0-9]+//g;
  # spurious punctuation glued between words (e.g. "UM,I") -> space
  $line =~ s/[A-Z']+,[A-Z']+/ /g;
  # break hyphenated compounds: ANTI-TRUST -> ANTI TRUST
  # (none of the dashed forms appear in cmudict anyway)
  $line =~ s/\-/ /g;
  # spelled-out acronyms: X_M_L -> X. M. L
  $line =~ s/\_/. /g;
  # trim the edges and collapse internal whitespace
  $line =~ s/^\s*//g;
  $line =~ s/\s*$//g;
  $line =~ s/\s+/ /g;
  # a lone "-" marks an empty transcript; blank it out so the caller
  # can skip it on the length check
  $line =~ s/^\-$//g;
  $line =~ s/\s+\-$//;
  # re-join frequent dashed interjections (Mm-Hmm, Uh-Huh) that are
  # common in AMI and will be kept in the dictionary
  $line =~ s/MM HMM/MM\-HMM/g;
  $line =~ s/UH HUH/UH\-HUH/g;

  return $line;
}
|
||
# Entry point: read the meeting transcript table, split over-long
# utterances into <=30-word segments, normalise the text and write one
# "MeetID H0<chan> Spkr stime etime transcript" line per segment.
#
# Input line format (whitespace separated, >=9 fields):
#   MeetID Channel Spkr Channel2 TransBtime TransEtime AutBtime AutEtime words...
#
# Fixes vs. original: three-arg open with lexical filehandles (the old
# `open(W, ">$out_file")` allowed mode injection via the file name and
# used bareword handles), die messages now include $!, diagnostics end
# with a newline, and close() of the write handle is checked so
# buffered write errors are not silently dropped.
if (@ARGV != 2) {
  print STDERR "Usage: ami_split_segments.pl <meet-file> <out-file>\n";
  exit(1);
}

my $meet_file = shift @ARGV;
my $out_file = shift @ARGV;
my %transcripts = ();

open(my $out_fh, '>', $out_file) or die "opening output file $out_file: $!";
open(my $in_fh, '<', $meet_file) or die "opening meeting file $meet_file: $!";

while (<$in_fh>) {

  my @A = split(" ", $_);
  if (@A < 9) { print "Skipping line @A\n"; next; }

  my ($meet_id, $channel, $spk, $channel2, $trans_btime, $trans_etime, $aut_btime, $aut_etime) = @A[0..7];
  my @transcript = @A[8..$#A];
  # split automatically-aligned utterances into segments of at most 30 words
  my %transcript = split_transcripts(\@transcript, $aut_btime, $aut_etime, 30);

  for my $key (keys %transcript) {
    my $value = $transcript{$key};
    my $segment = normalise_transcripts($value);
    # segment names encode centisecond times as "bbbbbbb_eeeeeee"
    my @times = split(/\_/, $key);
    if ($times[0] >= $times[1]) {
      print "Warning, $meet_id, $spk, $times[0] > $times[1]. Skipping. \n"; next;
    }
    # normalisation may leave an empty string; only keep real segments
    if (length($segment) > 0) {
      print {$out_fh} join " ", $meet_id, "H0${channel2}", $spk, $times[0]/100.0, $times[1]/100.0, $segment, "\n";
    }
  }

}
close($in_fh);
close($out_fh) or die "closing output file $out_file: $!";

print STDERR "Finished.\n";
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,47 @@ | ||
#!/usr/bin/env bash

# Copyright, University of Edinburgh (Pawel Swietojanski and Jonathan Kilgour)
#
# Export the AMI manual annotations to plain-text transcripts: use the
# NXT tool when a sufficiently recent Java is available, otherwise fall
# back to downloading a pre-exported version.
#
# Fixes vs. original: all path expansions are quoted so directories
# with spaces do not word-split, the "no found" message typo is
# corrected, and the integer test on JAVA_VER no longer spams stderr
# when the java version banner does not match the sed pattern.

if [ $# -ne 1 ]; then
  echo "Usage: $0 <ami-dir>"
  exit 1;
fi

adir=$1
wdir=$1/annotations

[ ! -f "$adir/annotations/AMI-metadata.xml" ] && echo "$0: File $adir/annotations/AMI-metadata.xml not found." && exit 1;

mkdir -p "$wdir/log"

# Crude "major.minor -> majmin" extraction (e.g. 1.5 -> 15). On JVMs
# whose banner does not match, JAVA_VER stays non-numeric, the -ge test
# below fails quietly, and we fall through to the download branch.
JAVA_VER=$(java -version 2>&1 | sed 's/java version "\(.*\)\.\(.*\)\..*"/\1\2/; 1q')

if [ "$JAVA_VER" -ge 15 ] 2>/dev/null; then
  if [ ! -d "$wdir/nxt" ]; then
    echo "Downloading NXT annotation tool..."
    wget -O "$wdir/nxt.zip" http://sourceforge.net/projects/nite/files/nite/nxt_1.4.4/nxt_1.4.4.zip
    [ ! -s "$wdir/nxt.zip" ] && echo "Downloading failed! ($wdir/nxt.zip)" && exit 1
    unzip -d "$wdir/nxt" "$wdir/nxt.zip" &> /dev/null
  fi

  if [ ! -f "$wdir/transcripts0" ]; then
    echo "Parsing XML files (can take several minutes)..."
    nxtlib=$wdir/nxt/lib
    # NXT FunctionQuery dumps the segment table (speaker, channel,
    # times, words, punctuation times) to transcripts0
    java -cp "$nxtlib/nxt.jar:$nxtlib/xmlParserAPIs.jar:$nxtlib/xalan.jar:$nxtlib" \
      FunctionQuery -c "$adir/annotations/AMI-metadata.xml" -q '($s segment)(exists $w1 w):$s^$w1' -atts obs who \
      '@extract(($sp speaker)($m meeting):$m@observation=$s@obs && $m^$sp & $s@who==$sp@nxt_agent,global_name, 0)'\
      '@extract(($sp speaker)($m meeting):$m@observation=$s@obs && $m^$sp & $s@who==$sp@nxt_agent, channel, 0)' \
      transcriber_start transcriber_end starttime endtime '$s' '@extract(($w w):$s^$w & $w@punc="true", starttime,0,0)' \
      1> "$wdir/transcripts0" 2> "$wdir/log/nxt_export.log"
  fi
else
  echo "$0. Java not found. Will download exported version of transcripts."
  annots=ami_manual_annotations_v1.6.1_export
  wget -O "$wdir/$annots.gzip" "http://groups.inf.ed.ac.uk/ami/AMICorpusAnnotations/$annots.gzip"
  gunzip -c "$wdir/${annots}.gzip" > "$wdir/transcripts0"
fi

#remove NXT logs dumped to stdio
grep -e '^Found' -e '^Obs' -i -v "$wdir/transcripts0" > "$wdir/transcripts1"

exit 0;
Oops, something went wrong.