-
Notifications
You must be signed in to change notification settings - Fork 0
/
step03SelectingNeologisms.py
74 lines (73 loc) · 6.35 KB
/
step03SelectingNeologisms.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
#!/usr/bin/env python3
# Epitran returns a string rather than a list, so the pronunciation has to be tokenised to avoid stray symbols;
# simply applying tokenise was enough to solve that problem.
# Input of this program: a directory of .tsv files with four columns. Output: a single .tsv file
# with seven columns: per-file counter, english, filename, language, word, tokenised IPA and raw IPA.
# The IndexError was fixed by adding a condition in:
# if tokenise(pron)[-1] not in NoAlFinalPalabra:
# The "ValueError: not enough values to unpack (expected 4, got 1)" was fixed by checking that len(row) == 4.
# This step deals with Aymara morphological rules.
from ipatok import tokenise
import os
import csv
# Phoneme inventories consulted by the selection filters below.

# Every segment a word may contain. The empty string and the hyphen are listed
# in case a way is found to show them in the output; for now tokenise
# suppresses them.
IPAaymara = [
    "", "-",
    "ɶ", "ä", "ɒ̈", "ɒ", "æ", "ɐ", "ɞ̞", "œ", "ɞ", "ɑ", "ɜ", "ʌ", "ə", "ø̞", "a", "y", "ɨ", "ɪ", "i",
    "e", "ɛ", "e̞", "E", "ʏ", "ɘ", "u", "o", "ʊ", "ɔ", "ɤ̞", "ɤ", "ɯ", "ɵ", "ʊ̈", "ʉ", "ø", "ɪ̈", "o̞",
    "t͡ʃ'", "t͡ʃʼ", "t͡ʃ", "t͡ʃʰ", "h", "k", "kʰ", "k'", "kʼ", "l", "ʎ", "m", "n", "ɲ", "ŋ",
    "p", "pʰ", "p'", "pʼ", "q", "qʰ", "q'", "qʼ", "ɾ", "s", "t", "tʰ", "t'", "tʼ", "χ", "w", "W", "j",
]

# Segments listed as not allowed at the end of a word.
NoAlFinalPalabra = [
    "ɲ", "k", "kʰ", "k'", "p", "pʰ", "p'", "q", "qʰ", "q'",
    "t", "tʰ", "t'", "t͡ʃ'", "t͡ʃʼ", "t͡ʃ", "t͡ʃʰ",
]

# Segments that may not be immediately followed by a consonant.
NotAtSyllableEnd = [
    "ɲ", "h", "kʰ", "k'", "kʼ", "pʰ", "p'", "pʼ", "qʼ", "qʰ", "q'",
    "tʼ", "tʰ", "t'", "t͡ʃ'", "t͡ʃʼ", "t͡ʃʰ",
]

# Allophonic vowels: a word containing one of these is only checked further
# when it also contains a uvular segment.
alofonos = ["i", "ʏ", "ɘ", "ɛ", "e", "o", "ɔ", "e̞", "E", "o̞"]

uvular = ["q", "qʼ", "q'", "qʰ", "χ"]

vowels = [
    "ɶ", "ä", "ɒ̈", "ɒ", "æ", "ɐ", "ɞ̞", "œ", "ɞ", "ɑ", "ɜ", "ʌ", "ə", "ø̞", "a", "y", "ɨ", "ɪ", "i",
    "e", "ɛ", "e̞", "E", "ʏ", "ɘ", "u", "o", "ʊ", "ɔ", "ɤ̞", "ɤ", "ɯ", "ɵ", "ʊ̈", "ʉ", "ø", "ɪ̈", "o̞",
]

# The 3 Aymara vowels together with their usual allophones.
vowelsaymara = ["æ", "ɑ", "a", "ɪ", "i", "e", "ɛ", "u", "o", "ʊ", "ɔ"]

consonants = [
    "t͡ʃ'", "t͡ʃʼ", "t͡ʃ", "t͡ʃʰ", "h", "k", "kʰ", "k'", "kʼ", "l", "ʎ", "m", "n", "ɲ", "ŋ",
    "p", "pʰ", "p'", "pʼ", "q", "qʰ", "q'", "qʼ", "ɾ", "s", "t", "tʰ", "t'", "tʼ", "χ", "w", "W", "j",
]

# Directory holding the input .tsv files (trailing slash included).
directory = "/home/alonso/PalabrasTraducidasTranscritasIPA/"
# Alternative input location used previously:
# directory = '/media/alonso/3361-6630/neologisms/GoogleAPIcloudEpitran/'
# ---------------------------------------------------------------------------
# Selection of candidate words.
#
# Every file in `directory` is read as a tab-separated table. Rows with exactly
# four columns (index, english, word, pron) are examined; a row whose tokenised
# pronunciation passes all filters in _is_selectable() is appended to the
# output file as seven tab-separated columns:
#   per-file counter, english, filename, filename[13:15], word,
#   tokenised IPA (printed as a Python list), raw IPA.
# ---------------------------------------------------------------------------

# Rows are appended, so results from earlier runs are kept.
_OUTPUT_FILE = "filetsvTranslatedbyGoogleIPAbyEpitranTokenisedbyIPAtokSelectedNeologisms4.tsv"

# Frozen copies of the phoneme lists: membership tests become O(1) lookups
# instead of linear scans of the lists.
_ALLOWED_SEGMENTS = frozenset(IPAaymara)
_CONSONANTS = frozenset(consonants)
_VOWELS = frozenset(vowels)
_ALLOPHONES = frozenset(alofonos)
_UVULARS = frozenset(uvular)
_NOT_BEFORE_CONSONANT = frozenset(NotAtSyllableEnd)


def _has_adjacent(tokens, first, second):
    """Return True if some neighbouring pair (a, b) has a in first and b in second."""
    return any(a in first and b in second for a, b in zip(tokens, tokens[1:]))


def _is_selectable(tokens):
    """Return True if a tokenised pronunciation passes every selection filter.

    A word is kept only when all of the following hold:

    * it has at least two segments;
    * every segment belongs to IPAaymara;
    * its first segment is not "ɾ";
    * its last segment is not in consonants;
    * if it contains a segment from alofonos, it also contains one from uvular;
    * no two segments from vowels are adjacent (diphthong or hiatus);
    * no segment from NotAtSyllableEnd is immediately followed by a consonant.
    """
    # The pairwise scans of the original script never ran for words shorter
    # than two segments, so such words were never written; that is preserved.
    # NOTE(review): confirm that rejecting one-segment words is intended.
    if len(tokens) < 2:
        return False
    if not all(segment in _ALLOWED_SEGMENTS for segment in tokens):
        return False
    if tokens[0] == "ɾ":
        return False
    if tokens[-1] in _CONSONANTS:
        return False
    # Allophonic vowels are only accepted when a uvular is present in the word.
    has_allophone = any(segment in _ALLOPHONES for segment in tokens)
    if has_allophone and not any(segment in _UVULARS for segment in tokens):
        return False
    if _has_adjacent(tokens, _VOWELS, _VOWELS):
        return False
    if _has_adjacent(tokens, _NOT_BEFORE_CONSONANT, _CONSONANTS):
        return False
    return True


# List the inputs before the output file is opened, so that a freshly created
# output file can never show up among the files to be read.
_input_names = sorted(os.listdir(directory))

with open(_OUTPUT_FILE, "a", encoding="utf-8") as _out:
    for filename in _input_names:
        _path = os.path.join(directory, filename)
        # Sub-directories cannot be opened as tables; skip them.
        if not os.path.isfile(_path):
            continue
        # newline="" is what the csv module expects for files handed to csv.reader.
        with open(_path, "r", encoding="utf-8", newline="") as csvfile:
            count = 0  # the counter restarts for every input file
            for row in csv.reader(csvfile, delimiter="\t"):
                # Malformed rows would fail the four-way unpacking below.
                if len(row) != 4:
                    continue
                _index, english, word, pron = row
                # An apostrophe followed by a tie bar makes tokenise raise
                # "ValueError: The string starts with a tie bar"; skip such rows.
                if "'͡" in pron:
                    continue
                # Tokenise once per row and reuse the result in every check.
                tokens = tokenise(pron)
                if not _is_selectable(tokens):
                    continue
                count += 1
                # filename[13:15] is the column the file header calls the language.
                _out.write(
                    f"{count}\t{english}\t{filename}\t{filename[13:15]}\t{word}\t{tokens}\t{pron}\n"
                )