diff --git a/configs/acoustic.yaml b/configs/acoustic.yaml index 55ad4d99..f506da81 100644 --- a/configs/acoustic.yaml +++ b/configs/acoustic.yaml @@ -45,6 +45,7 @@ mel_base: 'e' energy_smooth_width: 0.12 breathiness_smooth_width: 0.12 voicing_smooth_width: 0.12 +voicing_domain: 'mulaw' tension_smooth_width: 0.12 use_lang_id: false diff --git a/configs/templates/config_acoustic.yaml b/configs/templates/config_acoustic.yaml index 1991dc4a..94ade55a 100644 --- a/configs/templates/config_acoustic.yaml +++ b/configs/templates/config_acoustic.yaml @@ -49,6 +49,7 @@ use_energy_embed: false use_breathiness_embed: false use_voicing_embed: false use_tension_embed: false +voicing_domain: 'mulaw' use_key_shift_embed: true use_speed_embed: true diff --git a/configs/templates/config_variance.yaml b/configs/templates/config_variance.yaml index ca5cd3a3..a8ca0af3 100644 --- a/configs/templates/config_variance.yaml +++ b/configs/templates/config_variance.yaml @@ -59,6 +59,7 @@ breathiness_db_max: -20.0 voicing_db_min: -96.0 voicing_db_max: -12.0 +voicing_domain: 'mulaw' tension_logit_min: -10.0 tension_logit_max: 10.0 diff --git a/configs/variance.yaml b/configs/variance.yaml index 10c84e88..8cf40445 100644 --- a/configs/variance.yaml +++ b/configs/variance.yaml @@ -82,6 +82,7 @@ breathiness_smooth_width: 0.12 voicing_db_min: -96.0 voicing_db_max: -12.0 voicing_smooth_width: 0.12 +voicing_domain: 'mulaw' tension_logit_min: -10.0 tension_logit_max: 10.0 diff --git a/modules/fastspeech/param_adaptor.py b/modules/fastspeech/param_adaptor.py index 77ebb833..e0af3088 100644 --- a/modules/fastspeech/param_adaptor.py +++ b/modules/fastspeech/param_adaptor.py @@ -49,7 +49,7 @@ def build_adaptor(self, cls=MultiVarianceDiffusion): if self.predict_voicing: ranges.append(( hparams['voicing_db_min'], - hparams['voicing_db_max'] + 0. 
if hparams.get('voicing_domain', 'db')=='mulaw' else hparams['voicing_db_max'] )) clamps.append((hparams['voicing_db_min'], 0.)) diff --git a/preprocessing/acoustic_binarizer.py b/preprocessing/acoustic_binarizer.py index 9301f14b..0a73882a 100644 --- a/preprocessing/acoustic_binarizer.py +++ b/preprocessing/acoustic_binarizer.py @@ -193,7 +193,7 @@ def process_item(self, item_name, meta_data, binarization_args): if self.need_voicing: # get ground truth voicing voicing = get_voicing( - dec_waveform, None, None, length=length + dec_waveform, None, None, length=length, domain=hparams.get('voicing_domain', 'db') ) global voicing_smooth diff --git a/preprocessing/variance_binarizer.py b/preprocessing/variance_binarizer.py index 3d2990fe..8d7bec17 100644 --- a/preprocessing/variance_binarizer.py +++ b/preprocessing/variance_binarizer.py @@ -478,7 +478,7 @@ def process_item(self, item_name, meta_data, binarization_args): ) if voicing is None: voicing = get_voicing( - dec_waveform, None, None, length=length + dec_waveform, None, None, length=length, domain=hparams.get('voicing_domain', 'db') ) voicing_from_wav = True diff --git a/utils/binarizer_utils.py b/utils/binarizer_utils.py index df521642..77fd0397 100644 --- a/utils/binarizer_utils.py +++ b/utils/binarizer_utils.py @@ -79,14 +79,15 @@ def get_pitch_parselmouth( return f0, uv -def get_energy_librosa(waveform, length, *, hop_size, win_size, domain='db'): +def get_energy_librosa(waveform, length, *, hop_size, win_size, domain='db', mu=255.0): """ Definition of energy: RMS of the waveform, in dB representation :param waveform: [T] :param length: Expected number of frames :param hop_size: Frame width, in number of samples :param win_size: Window size, in number of samples - :param domain: db or amplitude + :param domain: 'db', 'amplitude', or 'mulaw' + :param mu: mu parameter for mu-law compression :return: energy """ energy = librosa.feature.rms(y=waveform, frame_length=win_size, hop_length=hop_size)[0] @@ -97,6 
+98,10 @@ def get_energy_librosa(waveform, length, *, hop_size, win_size, domain='db'): energy = librosa.amplitude_to_db(energy) elif domain == 'amplitude': pass + elif domain == 'mulaw': + energy = np.log1p(mu * energy) / np.log1p(mu) + # The API is frozen, so for compatibility the mu-law value is rescaled linearly (x * 96 - 96): zero energy maps to -96, matching the configured voicing_db_min of -96.0. + energy = energy * 96 -96 else: raise ValueError(f'Invalid domain: {domain}') return energy @@ -134,7 +139,8 @@ def get_breathiness( def get_voicing( waveform: Union[np.ndarray, DecomposedWaveform], samplerate, f0, length, - *, hop_size=None, fft_size=None, win_size=None + *, hop_size=None, fft_size=None, win_size=None, + domain='db', mu=255.0 ): """ Definition of voicing: RMS of the harmonic part, in dB representation @@ -145,6 +151,8 @@ def get_voicing( :param hop_size: Frame width, in number of samples :param fft_size: Number of fft bins :param win_size: Window size, in number of samples + :param domain: 'db', 'amplitude', or 'mulaw' + :param mu: mu parameter for mu-law compression :return: voicing """ if not isinstance(waveform, DecomposedWaveform): @@ -155,7 +163,8 @@ def get_voicing( waveform_sp = waveform.harmonic() voicing = get_energy_librosa( waveform_sp, length=length, - hop_size=waveform.hop_size, win_size=waveform.win_size + hop_size=waveform.hop_size, win_size=waveform.win_size, + domain=domain, mu=mu ) return voicing