diff --git a/penn/data/preprocess/core.py b/penn/data/preprocess/core.py
index 9600d7c..3eb5f34 100644
--- a/penn/data/preprocess/core.py
+++ b/penn/data/preprocess/core.py
@@ -272,46 +272,50 @@ def gset():
         unvoiced = pitch == 0
         voiced = ~unvoiced
 
-        pitch_list = np.vsplit(pitch, pitch.shape[0])
-        pitch_list_final = []
+        # For sampling rates like 11025, 22050, 44100, resampling isn't necessary
+        if GSET_SAMPLE_RATE / penn.SAMPLE_RATE % 1 != 0:
+            print("Resampling to penn.SAMPLE_RATE")
 
-        voiced_list = np.vsplit(voiced, voiced.shape[0])
-        voiced_list_final = []
+            pitch_list = np.vsplit(pitch, pitch.shape[0])
+            pitch_list_final = []
 
-        for pitch_arr, voiced_arr in zip(pitch_list, voiced_list):
-            # Get target number of frames
-            frames = penn.convert.samples_to_frames(audio.shape[-1])
+            voiced_list = np.vsplit(voiced, voiced.shape[0])
+            voiced_list_final = []
 
-            pitch_arr = pitch_arr[0, :]
-            voiced_arr = voiced_arr[0, :]
+            for pitch_arr, voiced_arr in zip(pitch_list, voiced_list):
+                # Get target number of frames
+                frames = penn.convert.samples_to_frames(audio.shape[-1])
 
-            # Linearly interpolate to target number of frames
-            new_times = penn.HOPSIZE_SECONDS * np.arange(0, frames)
-            new_times += penn.HOPSIZE_SECONDS / 2.
-            pitch_arr = 2. ** np.interp(new_times, times, np.log2(pitch_arr))
+                pitch_arr = pitch_arr[0, :]
+                voiced_arr = voiced_arr[0, :]
 
-            # Linearly interpolate voiced_arr/unvoiced_arr tokens
-            voiced_arr = np.interp(new_times, times, voiced_arr) > .5
+                # Linearly interpolate to target number of frames
+                new_times = penn.HOPSIZE_SECONDS * np.arange(0, frames)
+                new_times += penn.HOPSIZE_SECONDS / 2.
+                pitch_arr = 2. ** np.interp(new_times, times, np.log2(pitch_arr))
 
-            # Check shapes
-            assert (
-                penn.convert.samples_to_frames(audio.shape[-1]) ==
-                pitch_arr.shape[-1] ==
-                voiced_arr.shape[-1])
+                # Linearly interpolate voiced_arr/unvoiced_arr tokens
+                voiced_arr = np.interp(new_times, times, voiced_arr) > .5
 
-            assert np.logical_not(pitch_arr[voiced_arr] == 0).all()
+                # Check shapes
+                assert (
+                    penn.convert.samples_to_frames(audio.shape[-1]) ==
+                    pitch_arr.shape[-1] ==
+                    voiced_arr.shape[-1])
 
-            pitch_list_final.append(pitch_arr)
-            voiced_list_final.append(voiced_arr)
+                assert np.logical_not(pitch_arr[voiced_arr] == 0).all()
 
-        pitch = np.vstack(pitch_list_final)
-        voiced = np.vstack(voiced_list_final)
+                pitch_list_final.append(pitch_arr)
+                voiced_list_final.append(voiced_arr)
 
-        if pitch.shape[0] == 1:
-            pitch = pitch[0, :]
+            pitch = np.vstack(pitch_list_final)
+            voiced = np.vstack(voiced_list_final)
 
-        if voiced.shape[0] == 1:
-            voiced = voiced[0, :]
+            if pitch.shape[0] == 1:
+                pitch = pitch[0, :]
+
+            if voiced.shape[0] == 1:
+                voiced = voiced[0, :]
 
         # Save to cache
         np.save(output_directory / f'{stem}-pitch.npy', pitch)
@@ -559,13 +563,11 @@ def extract_pitch_array_jams(jam: jams.JAMS, track, uniform=True) -> Tuple[np.nd
             freq = np.array([pitch.value['frequency']])
 
             # Don't keep track of zero or unvoiced frequencies
-            if np.sum(freq) == 0 or not pitch.value['voiced']:
-                freq = np.zeros(1)
-
-            # Append the observation time
-            entry_times = np.append(entry_times, pitch.time)
-            # Append the frequency
-            slice_pitch_list.append(freq)
+            if np.sum(freq) != 0 and pitch.value['voiced']:
+                # Append the observation time
+                entry_times = np.append(entry_times, pitch.time)
+                # Append the frequency
+                slice_pitch_list.append(freq)
 
         # Sort the pitch list before resampling just in case it is not already sorted
         entry_times, slice_pitch_array = sort_pitch_list(entry_times, slice_pitch_list)
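
For reference, the block that this first hunk moves under the sample-rate check boils down to linear interpolation of the pitch contour in log2 space plus a thresholded interpolation of the voicing mask. Below is a minimal standalone sketch of that technique; the function name `resample_pitch`, the 12.5 ms hop, and the toy contour are illustrative assumptions, not names or values taken from the repository, and the sketch assumes all input frames are voiced (zero-valued pitch frames would need separate handling before `np.log2`).

```python
import numpy as np

def resample_pitch(pitch, voiced, times, hopsize_seconds, n_frames):
    """Resample a pitch contour and voicing mask onto a new frame grid."""
    # Centers of the target frames on the new grid
    new_times = hopsize_seconds * np.arange(n_frames) + hopsize_seconds / 2.

    # Interpolate pitch in log2 space so interpolation is linear in cents
    resampled_pitch = 2. ** np.interp(new_times, times, np.log2(pitch))

    # Interpolate the voicing mask as floats, then threshold back to boolean
    resampled_voiced = np.interp(new_times, times, voiced.astype(float)) > .5

    return resampled_pitch, resampled_voiced

# Toy usage: a 1-second, 100-frame contour resampled onto an 80-frame grid
times = np.arange(100) * .01
pitch = np.full(100, 220.)              # constant A3, fully voiced
voiced = np.ones(100, dtype=bool)
new_pitch, new_voiced = resample_pitch(pitch, voiced, times, .0125, 80)
assert new_pitch.shape == (80,) and new_voiced.all()
```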