import soundfile
import librosa
import numpy as np
import pickle

AVAILABLE_EMOTIONS = {
    "neutral",
    "calm",
    "happy",
    "sad",
    "angry",
    "fear",
    "disgust",
    "ps",  # pleasant surprised
    "boredom"
}


def get_label(audio_config):
    """Returns a label string describing which features are to be extracted,
    e.g.:
        audio_config = {'mfcc': True, 'chroma': True, 'contrast': False, 'tonnetz': False, 'mel': False}
        get_label(audio_config) -> 'mfcc-chroma'
    """
    features = ["mfcc", "chroma", "mel", "contrast", "tonnetz"]
    label = ""
    for feature in features:
        if audio_config[feature]:
            label += f"{feature}-"
    return label.rstrip("-")


def get_dropout_str(dropout, n_layers=3):
    """Returns a string encoding of `dropout`: one rate per layer when a list
    is passed, or the same rate repeated `n_layers` times for a float."""
    if isinstance(dropout, list):
        return "_".join([str(d) for d in dropout])
    elif isinstance(dropout, float):
        return "_".join([str(dropout) for _ in range(n_layers)])


def get_first_letters(emotions):
    """Returns the uppercased first letters of `emotions`, sorted alphabetically."""
    return "".join(sorted([e[0].upper() for e in emotions]))


def extract_feature(file_name, **kwargs):
    """
    Extracts features from the audio file `file_name`.
    Features supported:
        - MFCC (mfcc)
        - Chroma (chroma)
        - MEL Spectrogram Frequency (mel)
        - Contrast (contrast)
        - Tonnetz (tonnetz)
    e.g.:
        `features = extract_feature(path, mel=True, mfcc=True)`
    """
    mfcc = kwargs.get("mfcc")
    chroma = kwargs.get("chroma")
    mel = kwargs.get("mel")
    contrast = kwargs.get("contrast")
    tonnetz = kwargs.get("tonnetz")
    with soundfile.SoundFile(file_name) as sound_file:
        X = sound_file.read(dtype="float32")
        sample_rate = sound_file.samplerate
        if chroma or contrast:
            # chroma and contrast are both computed from the magnitude spectrogram
            stft = np.abs(librosa.stft(X))
        result = np.array([])
        if mfcc:
            mfccs = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40).T, axis=0)
            result = np.hstack((result, mfccs))
        if chroma:
            chroma = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T, axis=0)
            result = np.hstack((result, chroma))
        if mel:
            # `y` must be passed as a keyword argument; positional audio input
            # was removed in recent librosa releases
            mel = np.mean(librosa.feature.melspectrogram(y=X, sr=sample_rate).T, axis=0)
            result = np.hstack((result, mel))
        if contrast:
            contrast = np.mean(librosa.feature.spectral_contrast(S=stft, sr=sample_rate).T, axis=0)
            result = np.hstack((result, contrast))
        if tonnetz:
            tonnetz = np.mean(librosa.feature.tonnetz(y=librosa.effects.harmonic(X), sr=sample_rate).T, axis=0)
            result = np.hstack((result, tonnetz))
        return result
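# A minimal usage sketch ("speech.wav" is a placeholder path, not a file that
# ships with the repo). With the settings above, mfcc contributes 40 values,
# chroma 12 (librosa's default n_chroma) and mel 128 (default n_mels), so the
# stacked vector has shape (180,):
#     features = extract_feature("speech.wav", mfcc=True, chroma=True, mel=True)
#     print(features.shape)  # (180,)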


def get_best_estimators(classification):
    """
    Loads the estimators that were pickled into the `grid` folder.
    Note that if you want to use different or more estimators, you can
    fine-tune the parameters in the `grid_search.py` script and run it
    again (this may take hours).
    """
    if classification:
        with open("grid/best_classifiers.pickle", "rb") as f:
            return pickle.load(f)
    else:
        with open("grid/best_regressors.pickle", "rb") as f:
            return pickle.load(f)


def get_audio_config(features_list):
    """
    Converts a list of features into a dictionary understandable by the
    `data_extractor.AudioExtractor` class
    """
    audio_config = {'mfcc': False, 'chroma': False, 'mel': False, 'contrast': False, 'tonnetz': False}
    for feature in features_list:
        if feature not in audio_config:
            raise TypeError(f"Feature passed: {feature} is not recognized.")
        audio_config[feature] = True
    return audio_config
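

# A small self-contained smoke test, kept under a __main__ guard so importing
# this module has no side effects; it only exercises the helpers that need
# neither an audio file nor the pickled estimators.
if __name__ == "__main__":
    config = get_audio_config(["mfcc", "chroma"])
    print(get_label(config))                    # mfcc-chroma
    print(get_dropout_str([0.2, 0.3, 0.5]))     # 0.2_0.3_0.5
    print(get_first_letters(["sad", "angry"]))  # AS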