Interests

Projects

Education

Ph.D. in Computer Science, Department of Computer Science and Engineering, Texas A&M University, 2015-present
B.S. in Applied Physics, Department of Physics, University of Science and Technology of China, 2011-2015
Minor in Computer Science, School of Computer Science and Technology, University of Science and Technology of China, 2014-2015

@article{zhao-2019-taslp,
  author     = {Zhao, G. and Gutierrez-Osuna, R.},
  title      = {Using Phonetic Posteriorgram Based Frame Pairing for Segmental Accent Conversion},
  journal    = {IEEE/ACM Transactions on Audio, Speech, and Language Processing},
  volume     = {27},
  number     = {10},
  pages      = {1649--1660},
  year       = {2019},
  date       = {2019-07-04},
  doi        = {10.1109/TASLP.2019.2926754},
  issn       = {2329-9290},
  url        = {https://psi.engr.tamu.edu/wp-content/uploads/2019/07/zhao2019taslp.pdf},
  abstract   = {Accent conversion (AC) aims to transform non-native utterances to sound as if the speaker had a native accent. This can be achieved by mapping source speech spectra from a native speaker into the acoustic space of the target non-native speaker. In prior work, we proposed an AC approach that matches frames between the two speakers based on their acoustic similarity after compensating for differences in vocal tract length. In this paper, we propose a new approach that matches frames between the two speakers based on their phonetic (rather than acoustic) similarity. Namely, we map frames from the two speakers into a phonetic posteriorgram using speaker-independent acoustic models trained on native speech. We thoroughly evaluate the approach on a speech corpus containing multiple native and non-native speakers. The proposed algorithm outperforms the prior approach, improving ratings of acoustic quality (22\% increase in mean opinion score) and native accent (69\% preference) while retaining the voice quality of the non-native speaker. Furthermore, we show that the approach can be used in the reverse conversion direction, i.e., generating speech with a native speaker's voice quality and a non-native accent. Finally, we show that this approach can be applied to non-parallel training data, achieving the same accent conversion performance.},
  keywords   = {},
  pubstate   = {published},
  tppubtype  = {article}
}

Accent conversion (AC) aims to transform non-native utterances to sound as if the speaker had a native accent. This can be achieved by mapping source speech spectra from a native speaker into the acoustic space of the target non-native speaker. In prior work, we proposed an AC approach that matches frames between the two speakers based on their acoustic similarity after compensating for differences in vocal tract length. In this paper, we propose a new approach that matches frames between the two speakers based on their phonetic (rather than acoustic) similarity. Namely, we map frames from the two speakers into a phonetic posteriorgram using speaker-independent acoustic models trained on native speech. We thoroughly evaluate the approach on a speech corpus containing multiple native and non-native speakers. The proposed algorithm outperforms the prior approach, improving ratings of acoustic quality (22% increase in mean opinion score) and native accent (69% preference) while retaining the voice quality of the non-native speaker. Furthermore, we show that the approach can be used in the reverse conversion direction, i.e., generating speech with a native speaker's voice quality and a non-native accent. Finally, we show that this approach can be applied to non-parallel training data, achieving the same accent conversion performance.

@inproceedings{liberatore2018icassp,
  author     = {Liberatore, C. and Zhao, G. and Gutierrez-Osuna, R.},
  title      = {Voice Conversion through Residual Warping in a Sparse, Anchor-Based Representation of Speech},
  booktitle  = {Proc. ICASSP},
  year       = {2018},
  date       = {2018-04-15},
  url        = {https://psi.engr.tamu.edu/wp-content/uploads/2018/03/liberatore-icassp2018.pdf},
  abstract   = {In previous work we presented a Sparse, Anchor-Based Representation of speech (SABR) that uses phonemic ``anchors'' to represent an utterance with a set of sparse non-negative weights. SABR is speaker-independent: combining weights from a source speaker with anchors from a target speaker can be used for voice conversion. Here, we present an extension of the original SABR that significantly improves voice conversion synthesis. Namely, we take the residual signal from the SABR decomposition of the source speaker's utterance, and warp it to the target speaker's space using a weighted warping function learned from pairs of source-target anchors. Using subjective and objective evaluations, we examine the performance of adding the warped residual (SABR+Res) to the original synthesis (SABR). Specifically, listeners rated SABR+Res with an average mean opinion score (MOS) of 3.6, a significant improvement compared to 2.2 MOS for SABR alone ($p < 0.01$) and 2.5 MOS for a baseline GMM method ($p < 0.01$). In an XAB speaker identity test, listeners correctly identified the identity of SABR+Res (81\%) and SABR (84\%) as frequently as a GMM method (82\%) ($p = 0.70$, $p = 0.35$). These results indicate that adding the warped residual can dramatically improve synthesis while retaining the desirable independent qualities of SABR models.},
  keywords   = {},
  pubstate   = {published},
  tppubtype  = {inproceedings}
}

In previous work we presented a Sparse, Anchor-Based Representation of speech (SABR) that uses phonemic “anchors” to represent an utterance with a set of sparse non-negative weights. SABR is speaker-independent: combining weights from a source speaker with anchors from a target speaker can be used for voice conversion. Here, we present an extension of the original SABR that significantly improves voice conversion synthesis. Namely, we take the residual signal from the SABR decomposition of the source speaker’s utterance, and warp it to the target speaker’s space using a weighted warping function learned from pairs of source-target anchors. Using subjective and objective evaluations, we examine the performance of adding the warped residual (SABR+Res) to the original synthesis (SABR). Specifically, listeners rated SABR+Res with an average mean opinion score (MOS) of 3.6, a significant improvement compared to 2.2 MOS for SABR alone (𝑝 < 0.01) and 2.5 MOS for a baseline GMM method (𝑝 < 0.01). In an XAB speaker identity test, listeners correctly identified the identity of SABR+Res (81%) and SABR (84%) as frequently as a GMM method (82%) (𝑝 = 0.70, 𝑝 = 0.35). These results indicate that adding the warped residual can dramatically improve synthesis while retaining the desirable independent qualities of SABR models.